summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile54
-rw-r--r--kernel/acct.c8
-rw-r--r--kernel/async.c159
-rw-r--r--kernel/audit.c40
-rw-r--r--kernel/audit_tree.c36
-rw-r--r--kernel/audit_watch.c6
-rw-r--r--kernel/auditfilter.c1
-rw-r--r--kernel/auditsc.c20
-rw-r--r--kernel/cgroup.c328
-rw-r--r--kernel/compat.c106
-rw-r--r--kernel/context_tracking.c114
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c884
-rw-r--r--kernel/cred.c154
-rw-r--r--kernel/debug/debug_core.c1
-rw-r--r--kernel/debug/debug_core.h2
-rw-r--r--kernel/debug/gdbstub.c4
-rw-r--r--kernel/debug/kdb/kdb_bp.c20
-rw-r--r--kernel/debug/kdb/kdb_debugger.c25
-rw-r--r--kernel/debug/kdb/kdb_main.c137
-rw-r--r--kernel/debug/kdb/kdb_private.h4
-rw-r--r--kernel/delayacct.c7
-rw-r--r--kernel/events/core.c45
-rw-r--r--kernel/events/hw_breakpoint.c2
-rw-r--r--kernel/events/uprobes.c466
-rw-r--r--kernel/exit.c28
-rw-r--r--kernel/fork.c107
-rw-r--r--kernel/futex.c5
-rw-r--r--kernel/futex_compat.c21
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/hrtimer.c38
-rw-r--r--kernel/irq/chip.c30
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq/spurious.c7
-rw-r--r--kernel/irq_work.c150
-rw-r--r--kernel/kcmp.c1
-rw-r--r--kernel/kexec.c78
-rw-r--r--kernel/kfifo.c609
-rw-r--r--kernel/kmod.c15
-rw-r--r--kernel/kprobes.c66
-rw-r--r--kernel/lockdep.c32
-rw-r--r--kernel/modsign_certificate.S19
-rw-r--r--kernel/modsign_pubkey.c21
-rw-r--r--kernel/module.c689
-rw-r--r--kernel/mutex.c1
-rw-r--r--kernel/nsproxy.c37
-rw-r--r--kernel/padata.c5
-rw-r--r--kernel/panic.c34
-rw-r--r--kernel/pid.c74
-rw-r--r--kernel/pid_namespace.c117
-rw-r--r--kernel/posix-cpu-timers.c54
-rw-r--r--kernel/posix-timers.c27
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/process.c4
-rw-r--r--kernel/power/qos.c9
-rw-r--r--kernel/power/suspend.c69
-rw-r--r--kernel/power/suspend_test.c11
-rw-r--r--kernel/printk.c81
-rw-r--r--kernel/profile.c24
-rw-r--r--kernel/ptrace.c93
-rw-r--r--kernel/rcu.h7
-rw-r--r--kernel/rcupdate.c60
-rw-r--r--kernel/rcutiny.c8
-rw-r--r--kernel/rcutiny_plugin.h56
-rw-r--r--kernel/rcutorture.c66
-rw-r--r--kernel/rcutree.c260
-rw-r--r--kernel/rcutree.h11
-rw-r--r--kernel/rcutree_plugin.h13
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/res_counter.c20
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c1
-rw-r--r--kernel/rtmutex.c1
-rw-r--r--kernel/rwsem.c10
-rw-r--r--kernel/sched/auto_group.c3
-rw-r--r--kernel/sched/core.c272
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/cputime.c314
-rw-r--r--kernel/sched/debug.c101
-rw-r--r--kernel/sched/fair.c277
-rw-r--r--kernel/sched/features.h11
-rw-r--r--kernel/sched/rt.c28
-rw-r--r--kernel/sched/sched.h14
-rw-r--r--kernel/sched/stats.c79
-rw-r--r--kernel/seccomp.c13
-rw-r--r--kernel/signal.c453
-rw-r--r--kernel/smp.c192
-rw-r--r--kernel/smpboot.c7
-rw-r--r--kernel/softirq.c23
-rw-r--r--kernel/srcu.c37
-rw-r--r--kernel/stop_machine.c156
-rw-r--r--kernel/sys.c311
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c72
-rw-r--r--kernel/sysctl_binary.c45
-rw-r--r--kernel/time.c12
-rw-r--r--kernel/time/Kconfig9
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/ntp.c48
-rw-r--r--kernel/time/tick-broadcast.c38
-rw-r--r--kernel/time/tick-sched.c14
-rw-r--r--kernel/time/timekeeping.c71
-rw-r--r--kernel/timeconst.bc108
-rw-r--r--kernel/timeconst.pl378
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/Kconfig33
-rw-r--r--kernel/trace/blktrace.c30
-rw-r--r--kernel/trace/ftrace.c164
-rw-r--r--kernel/trace/power-traces.c3
-rw-r--r--kernel/trace/ring_buffer.c114
-rw-r--r--kernel/trace/trace.c330
-rw-r--r--kernel/trace/trace.h134
-rw-r--r--kernel/trace/trace_clock.c5
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_functions.c61
-rw-r--r--kernel/trace/trace_functions_graph.c68
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/trace/trace_probe.h1
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_selftest.c21
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_syscalls.c61
-rw-r--r--kernel/trace/trace_uprobe.c225
-rw-r--r--kernel/tracepoint.c6
-rw-r--r--kernel/tsacct.c44
-rw-r--r--kernel/user-return-notifier.c4
-rw-r--r--kernel/user.c9
-rw-r--r--kernel/user_namespace.c207
-rw-r--r--kernel/utsname.c36
-rw-r--r--kernel/utsname_sysctl.c3
-rw-r--r--kernel/watchdog.c35
-rw-r--r--kernel/workqueue.c1533
-rw-r--r--kernel/workqueue_internal.h65
-rw-r--r--kernel/workqueue_sched.h9
136 files changed, 6950 insertions, 5096 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ac0d533eb7de..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o cred.o \
async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
obj-y += sched/
obj-y += power/
-ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
-obj-$(CONFIG_X86) += kcmp.o
-endif
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -54,7 +52,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -127,20 +125,32 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(obj)/time.o: $(obj)/timeconst.h
-quiet_cmd_timeconst = TIMEC $@
- cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
- $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
ifeq ($(CONFIG_MODULE_SIG),y)
#
# Pull the signing certificate and any extra certificates into the kernel
#
+
+quiet_cmd_touch = TOUCH $@
+ cmd_touch = touch $@
+
extra_certificates:
- touch $@
+ $(call cmd,touch)
-kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
+kernel/modsign_certificate.o: signing_key.x509 extra_certificates
###############################################################################
#
@@ -149,23 +159,7 @@ kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
# fail and that the kernel may be used afterwards.
#
###############################################################################
-sign_key_with_hash :=
-ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
-sign_key_with_hash := -sha1
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
-sign_key_with_hash := -sha224
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
-sign_key_with_hash := -sha256
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
-sign_key_with_hash := -sha384
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
-sign_key_with_hash := -sha512
-endif
-ifeq ($(sign_key_with_hash),)
+ifndef CONFIG_MODULE_SIG_HASH
$(error Could not determine digest type to use from kernel config)
endif
@@ -178,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
@echo "### needs to be run as root, and uses a hardware random"
@echo "### number generator if one is available."
@echo "###"
- openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
- -x509 -config x509.genkey \
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
-outform DER -out signing_key.x509 \
-keyout signing_key.priv
@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file_inode(file)->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -566,6 +566,7 @@ out:
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime += current->utime;
- pacct->ac_stime += current->stime;
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 9d3118384858..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,56 +57,52 @@ asynchronous and synchronous parts of the kernel.
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include "workqueue_internal.h"
+
static async_cookie_t next_cookie = 1;
-#define MAX_WORK 32768
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
-static LIST_HEAD(async_pending);
-static ASYNC_DOMAIN(async_running);
-static LIST_HEAD(async_domains);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
-static DEFINE_MUTEX(async_register_mutex);
struct async_entry {
- struct list_head list;
+ struct list_head domain_list;
+ struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
async_func_ptr *func;
void *data;
- struct async_domain *running;
+ struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct async_domain *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct async_entry *entry;
-
- if (!list_empty(&running->domain)) {
- entry = list_first_entry(&running->domain, typeof(*entry), list);
- return entry->cookie;
- }
+ struct async_entry *first = NULL;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
+ unsigned long flags;
- list_for_each_entry(entry, &async_pending, list)
- if (entry->running == running)
- return entry->cookie;
+ spin_lock_irqsave(&async_lock, flags);
- return next_cookie; /* "infinity" value */
-}
+ if (domain) {
+ if (!list_empty(&domain->pending))
+ first = list_first_entry(&domain->pending,
+ struct async_entry, domain_list);
+ } else {
+ if (!list_empty(&async_global_pending))
+ first = list_first_entry(&async_global_pending,
+ struct async_entry, global_list);
+ }
-static async_cookie_t lowest_in_progress(struct async_domain *running)
-{
- unsigned long flags;
- async_cookie_t ret;
+ if (first)
+ ret = first->cookie;
- spin_lock_irqsave(&async_lock, flags);
- ret = __lowest_in_progress(running);
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
@@ -120,14 +116,8 @@ static void async_run_entry_fn(struct work_struct *work)
container_of(work, struct async_entry, work);
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
- struct async_domain *running = entry->running;
- /* 1) move self to the running queue */
- spin_lock_irqsave(&async_lock, flags);
- list_move_tail(&entry->list, &running->domain);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* 2) run (and print duration) */
+ /* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
(long long)entry->cookie,
@@ -144,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
(long long)ktime_to_ns(delta) >> 10);
}
- /* 3) remove self from the running queue */
+ /* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
- list_del(&entry->list);
- if (running->registered && --running->count == 0)
- list_del_init(&running->node);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
- /* 4) free the entry */
+ /* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 5) wake up any waiters */
+ /* 4) wake up any waiters */
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -183,19 +172,28 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
ptr(data, newcookie);
return newcookie;
}
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = ptr;
entry->data = data;
- entry->running = running;
+ entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->list, &async_pending);
- if (running->registered && running->count++ == 0)
- list_add_tail(&running->node, &async_domains);
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
+ /* mark that this task has queued an async job, used by module init */
+ current->flags |= PF_USED_ASYNC;
+
/* schedule for execution */
queue_work(system_unbound_wq, &entry->work);
@@ -212,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
*/
async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
{
- return __async_schedule(ptr, data, &async_running);
+ return __async_schedule(ptr, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
@@ -220,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
* @ptr: function to execute asynchronously
* @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct async_domain *running)
+ struct async_domain *domain)
{
- return __async_schedule(ptr, data, running);
+ return __async_schedule(ptr, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -242,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
- mutex_lock(&async_register_mutex);
- do {
- struct async_domain *domain = NULL;
-
- spin_lock_irq(&async_lock);
- if (!list_empty(&async_domains))
- domain = list_first_entry(&async_domains, typeof(*domain), node);
- spin_unlock_irq(&async_lock);
-
- async_synchronize_cookie_domain(next_cookie, domain);
- } while (!list_empty(&async_domains));
- mutex_unlock(&async_register_mutex);
+ async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
@@ -268,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
*/
void async_unregister_domain(struct async_domain *domain)
{
- mutex_lock(&async_register_mutex);
spin_lock_irq(&async_lock);
- WARN_ON(!domain->registered || !list_empty(&domain->node) ||
- !list_empty(&domain->domain));
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
domain->registered = 0;
spin_unlock_irq(&async_lock);
- mutex_unlock(&async_register_mutex);
}
EXPORT_SYMBOL_GPL(async_unregister_domain);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @domain: running list to synchronize on
+ * @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @domain have been done.
+ * synchronization domain specified by @domain have been done.
*/
void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, domain);
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by running list @running submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (!running)
- return;
-
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
- wait_event(async_done, lowest_in_progress(running) >= cookie);
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
@@ -334,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_domain(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+ struct worker *worker = current_wq_worker();
+
+ return worker && worker->current_func == async_run_entry_fn;
+}
diff --git a/kernel/audit.c b/kernel/audit.c
index 40414e9143db..d596e5355f15 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
int rc = 0;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return rc;
audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
old, from_kuid(&init_user_ns, loginuid), sessionid);
if (sid) {
@@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ if (unlikely(!*ab))
+ return rc;
audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
task_tgid_vnr(current),
from_kuid(&init_user_ns, current_uid()),
@@ -1097,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx,
}
}
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static void wait_for_auditd(unsigned long sleep_time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ schedule_timeout(sleep_time);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+}
+
/* Obtain an audit buffer. This routine does locking to obtain the
* audit buffer, but then no locking is required for calls to
* audit_log_*format. If the tsk is a task that is currently in a
@@ -1142,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
- && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ unsigned long sleep_time;
- /* Wait for auditd to drain the queue a little */
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&audit_backlog_wait, &wait);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
- schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
+ sleep_time = timeout_start + audit_backlog_wait_time -
+ jiffies;
+ if ((long)sleep_time > 0)
+ wait_for_auditd(sleep_time);
continue;
}
if (audit_rate_check() && printk_ratelimit())
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ed206fd88cca..642a89c4f3d6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,7 +249,7 @@ static void untag_chunk(struct node *p)
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
@@ -291,7 +291,7 @@ static void untag_chunk(struct node *p)
owner->root = new;
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
@@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
chunk->dead = 1;
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(entry);
return 0;
}
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(chunk_entry);
+ fsnotify_destroy_mark(chunk_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -443,17 +443,32 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(old_entry);
+ fsnotify_destroy_mark(old_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
return 0;
}
+static void audit_log_remove_rule(struct audit_krule *rule)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
- struct audit_buffer *ab;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=");
- audit_log_string(ab, "remove rule");
- audit_log_format(ab, " dir=");
- audit_log_untrustedstring(ab, rule->tree->pathname);
- audit_log_key(ab, rule->filterkey);
- audit_log_format(ab, " list=%d res=1", rule->listnr);
- audit_log_end(ab);
+ audit_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9a9ae6e3d290..22831c4d369c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab, "auid=%u ses=%u op=",
from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
@@ -350,7 +352,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
}
mutex_unlock(&audit_filter_mutex);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
}
/* Get path information necessary for adding watches. */
@@ -457,7 +459,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
if (list_empty(&parent->watches)) {
audit_get_parent(parent);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
audit_put_parent(parent);
}
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7f19f23d38a3..f9fc54bbe06f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
* audit_receive_filter - apply all rules to the specified message type
* @type: audit message type
* @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e37e6a12c5e3..a371f857a0a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1464,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab,
"qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
context->ipc.perm_mode);
- if (!ab)
- return;
}
break; }
case AUDIT_MQ_OPEN: {
@@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
-static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
kgid_t gid;
@@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
audit_log_task_context(ab);
audit_log_format(ab, " pid=%d comm=", current->pid);
audit_log_untrustedstring(ab, current->comm);
+}
+
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+ audit_log_task(ab);
audit_log_format(ab, " reason=");
audit_log_string(ab, reason);
audit_log_format(ab, " sig=%ld", signr);
@@ -2715,6 +2720,8 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ if (unlikely(!ab))
+ return;
audit_log_abend(ab, "memory violation", signr);
audit_log_end(ab);
}
@@ -2723,8 +2730,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "seccomp", signr);
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
audit_log_format(ab, " syscall=%ld", syscall);
audit_log_format(ab, " compat=%d", is_compat_task());
audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f34c41bfaa37..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
int i;
- int index;
- unsigned long tmp = 0UL;
+ unsigned long key = 0UL;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
-
- return &css_set_table[index];
+ return key;
}
/* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
css_set_count--;
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
struct cgroup *cgrp = link->cgrp;
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
+
+ /*
+ * We may not be holding cgroup_mutex, and if cgrp->count is
+ * dropped to 0 the cgroup can be destroyed at any time, hence
+ * rcu_read_lock is used to keep it alive.
+ */
+ rcu_read_lock();
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
+ rcu_read_unlock();
kfree(link);
}
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set(
{
int i;
struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
struct css_set *cg;
+ unsigned long key;
/*
* Build the set of subsystem state objects that we want to see in the
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set(
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cg, hlist, key) {
if (!compare_css_sets(cg, oldcg, cgrp, template))
continue;
@@ -657,8 +660,8 @@ static struct css_set *find_css_set(
struct list_head tmp_cg_links;
- struct hlist_head *hhead;
struct cg_cgroup_link *link;
+ unsigned long key;
/* First see if we already have a cgroup group that matches
* the desired set */
@@ -704,8 +707,8 @@ static struct css_set *find_css_set(
css_set_count++;
/* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ key = css_set_hash(res->subsys);
+ hash_add(css_set_table, &res->hlist, key);
write_unlock(&css_set_lock);
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
+ struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+ struct cgroup_subsys *ss;
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->css_free(cgrp);
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss)
+ ss->css_free(cgrp);
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup
+ */
+ deactivate_super(cgrp->root->sb);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
+ simple_xattrs_free(&cgrp->xattrs);
+
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+ kfree(cgrp);
+}
- simple_xattrs_free(&cgrp->xattrs);
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+ schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+ /* is dentry a directory ? if so, kfree() associated cgroup */
+ if (S_ISDIR(inode->i_mode)) {
+ struct cgroup *cgrp = dentry->d_fsdata;
- ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
- kfree_rcu(cgrp, rcu_head);
+ BUG_ON(!(cgroup_is_removed(cgrp)));
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d)
dput(parent);
}
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
struct cfent *cfe;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
+ /*
+ * If we're doing cleanup due to failure of cgroup_create(),
+ * the corresponding @cfe may not exist.
+ */
list_for_each_entry(cfe, &cgrp->files, node) {
struct dentry *d = cfe->dentry;
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
list_del_init(&cfe->node);
dput(d);
- return 0;
+ break;
}
- return -ENOENT;
}
/**
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
}
}
root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
- synchronize_rcu();
return 0;
}
@@ -1333,7 +1345,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
- /* See feature-removal-schedule.txt */
if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
@@ -1394,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_WORK(&cgrp->free_work, cgroup_free_fn);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
@@ -1598,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
+ struct css_set *cg;
BUG_ON(sb->s_root != NULL);
@@ -1651,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
-
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
- }
+ hash_for_each(css_set_table, i, cg, hlist)
+ link_css_set(&tmp_cg_links, cg, root_cgrp);
write_unlock(&css_set_lock);
free_cg_links(&tmp_cg_links);
@@ -1774,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
"cgroup_path() called without proper locking");
- if (!dentry || cgrp == dummytop) {
+ if (cgrp == dummytop) {
/*
* Inactive subsystems have no dentry for their root
* cgroup
@@ -1983,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
ss->attach(cgrp, &tset);
}
- synchronize_rcu();
out:
if (retval) {
for_each_subsys(root, ss) {
@@ -2152,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
/*
* step 5: success! and cleanup
*/
- synchronize_rcu();
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -2638,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
*/
static inline struct cftype *__file_cft(struct file *file)
{
- if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+ if (file_inode(file)->i_fop != &cgroup_file_operations)
return ERR_PTR(-EINVAL);
return __d_cft(file->f_dentry);
}
@@ -2770,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
continue;
- if (is_add)
+ if (is_add) {
err = cgroup_add_file(cgrp, subsys, cft);
- else
- err = cgroup_rm_file(cgrp, cft);
- if (err) {
- pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
- is_add ? "add" : "remove", cft->name, err);
+ if (err)
+ pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+ cft->name, err);
ret = err;
+ } else {
+ cgroup_rm_file(cgrp, cft);
}
}
return ret;
@@ -3018,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant,
+ * @pos is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last, *tmp;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ list_for_each_entry_rcu(tmp, &last->children, sibling)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
struct cgroup *last;
@@ -3409,7 +3440,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = current->nsproxy->pid_ns;
+ struct pid_namespace *ns = task_active_pid_ns(current);
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3753,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
remove);
struct cgroup *cgrp = event->cgrp;
+ remove_wait_queue(event->wqh, &event->wait);
+
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
eventfd_ctx_put(event->eventfd);
kfree(event);
dput(cgrp->dentry);
@@ -3774,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
- __remove_wait_queue(event->wqh, &event->wait);
- spin_lock(&cgrp->event_list_lock);
- list_del_init(&event->list);
- spin_unlock(&cgrp->event_list_lock);
/*
- * We are in atomic context, but cgroup_event_remove() may
- * sleep, so we have to call it in workqueue.
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
*/
- schedule_work(&event->remove);
+ spin_lock(&cgrp->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&cgrp->event_list_lock);
}
return 0;
@@ -3808,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
struct cgroup_event *event = NULL;
+ struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
@@ -3853,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
+ ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto fail;
@@ -3863,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
goto fail;
}
+ /*
+ * The file to be monitored must be in the same cgroup as
+ * cgroup.event_control is.
+ */
+ cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+ if (cgrp_cfile != cgrp) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;
@@ -4136,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
init_cgroup_housekeeping(cgrp);
+ dentry->d_fsdata = cgrp;
+ cgrp->dentry = dentry;
+
cgrp->parent = parent;
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
@@ -4173,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
lockdep_assert_held(&dentry->d_inode->i_mutex);
/* allocation complete, commit to creation */
- dentry->d_fsdata = cgrp;
- cgrp->dentry = dentry;
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
@@ -4341,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace. Use
- * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
- * cgroup_event_wake() is called with the wait queue head locked,
- * remove_wait_queue() cannot be called while holding event_list_lock.
+ * directory to avoid race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
- list_splice_init(&cgrp->event_list, &tmp_list);
- spin_unlock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+ list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
- remove_wait_queue(event->wqh, &event->wait);
- eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
+ spin_unlock(&cgrp->event_list_lock);
return 0;
}
@@ -4439,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
int i, ret;
+ struct hlist_node *tmp;
+ struct css_set *cg;
+ unsigned long key;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4504,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct css_set *cg;
- struct hlist_node *node, *tmp;
- struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
- hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
- /* skip entries that we already rehashed */
- if (cg->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hlist_del(&cg->hlist);
- /* set new value */
- cg->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- new_bucket = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, new_bucket);
- }
+ hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
+ /* skip entries that we already rehashed */
+ if (cg->subsys[ss->subsys_id])
+ continue;
+ /* remove existing entry */
+ hash_del(&cg->hlist);
+ /* set new value */
+ cg->subsys[ss->subsys_id] = css;
+ /* recompute hash and restore entry */
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
@@ -4552,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cg_cgroup_link *link;
- struct hlist_head *hhead;
BUG_ON(ss->module == NULL);
@@ -4568,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
offline_css(ss, dummytop);
ss->active = 0;
- if (ss->use_id) {
- idr_remove_all(&ss->idr);
+ if (ss->use_id)
idr_destroy(&ss->idr);
- }
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
@@ -4586,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
write_lock(&css_set_lock);
list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
struct css_set *cg = link->cg;
+ unsigned long key;
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
cg->subsys[ss->subsys_id] = NULL;
- hhead = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, hhead);
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
@@ -4632,9 +4679,6 @@ int __init cgroup_init_early(void)
list_add(&init_css_set_link.cg_link_list,
&init_css_set.cg_links);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
@@ -4668,7 +4712,7 @@ int __init cgroup_init(void)
{
int err;
int i;
- struct hlist_head *hhead;
+ unsigned long key;
err = bdi_init(&cgroup_backing_dev_info);
if (err)
@@ -4687,8 +4731,8 @@ int __init cgroup_init(void)
}
/* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
BUG_ON(!init_root_id(&rootnode));
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4983,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
+ put_css_set_taskexit(cg);
}
/**
@@ -5275,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
struct css_id *newid;
- int myid, error, size;
+ int ret, size;
BUG_ON(!ss->use_id);
@@ -5283,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
newid = kzalloc(size, GFP_KERNEL);
if (!newid)
return ERR_PTR(-ENOMEM);
- /* get id */
- if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
- error = -ENOMEM;
- goto err_out;
- }
+
+ idr_preload(GFP_KERNEL);
spin_lock(&ss->id_lock);
/* Don't use 0. allocates an ID of 1-65535 */
- error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+ ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
spin_unlock(&ss->id_lock);
+ idr_preload_end();
/* Returns error when there are no free spaces for new ID.*/
- if (error) {
- error = -ENOSPC;
+ if (ret < 0)
goto err_out;
- }
- if (myid > CSS_ID_MAX)
- goto remove_idr;
- newid->id = myid;
+ newid->id = ret;
newid->depth = depth;
return newid;
-remove_idr:
- error = -ENOSPC;
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, myid);
- spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
- return ERR_PTR(error);
+ return ERR_PTR(ret);
}
@@ -5442,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
struct inode *inode;
struct cgroup_subsys_state *css;
- inode = f->f_dentry->d_inode;
+ inode = file_inode(f);
/* check in cgroup filesystem dir */
if (inode->i_op != &cgroup_dir_inode_operations)
return ERR_PTR(-EBADF);
diff --git a/kernel/compat.c b/kernel/compat.c
index c28a306ae05c..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
-asmlinkage long compat_sys_getitimer(int which,
- struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
return error;
}
-asmlinkage long compat_sys_setitimer(int which,
- struct compat_itimerval __user *in,
- struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
{
struct itimerval kin, kout;
int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
memcpy(blocked->sig, &set, sizeof(set));
}
-asmlinkage long compat_sys_sigprocmask(int how,
- compat_old_sigset_t __user *nset,
- compat_old_sigset_t __user *oset)
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
{
old_sigset_t old_set, new_set;
sigset_t new_blocked;
@@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long
-compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
- struct compat_rusage __user *ru)
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
{
if (!ru) {
return sys_wait4(pid, stat_addr, options, NULL);
@@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
}
}
-asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
- struct compat_siginfo __user *uinfo, int options,
- struct compat_rusage __user *uru)
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, uinfo, int, options,
+ struct compat_rusage __user *, uru)
{
siginfo_t info;
struct rusage ru;
@@ -584,9 +587,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
return ret;
if (uru) {
- ret = put_compat_rusage(&ru, uru);
+ /* sys_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ ret = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ ret = put_compat_rusage(&ru, uru);
if (ret)
- return ret;
+ return -EFAULT;
}
BUG_ON(info.si_code & __SI_MASK);
@@ -964,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -975,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
}
EXPORT_SYMBOL_GPL(sigset_from_compat);
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
- struct compat_siginfo __user *uinfo,
- struct compat_timespec __user *uts, compat_size_t sigsetsize)
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
@@ -994,7 +1011,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
sigset_from_compat(&s, &s32);
if (uts) {
- if (get_compat_timespec(&t, uts))
+ if (compat_get_timespec(&t, uts))
return -EFAULT;
}
@@ -1006,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
}
return ret;
-
-}
-
-asmlinkage long
-compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
- struct compat_siginfo __user *uinfo)
-{
- siginfo_t info;
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1060,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
- sigset_t newset;
- compat_sigset_t newset32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&newset, &newset32);
- return sigsuspend(&newset);
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
{
struct timex txc;
@@ -1215,6 +1203,22 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
return 0;
}
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
+{
+ struct timespec t;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
+ set_fs(old_fs);
+ if (put_compat_timespec(&t, interval))
+ return -EFAULT;
+ return ret;
+}
+
/*
* Allocate user-space memory for the duration of a single system call,
* in order to marshall parameters inside a compat thunk.
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e0e07fd55508..65349f07b878 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,29 +1,41 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
#include <linux/context_tracking.h>
+#include <linux/kvm_host.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/export.h>
-struct context_tracking {
- /*
- * When active is false, hooks are not set to
- * minimize overhead: TIF flags are cleared
- * and calls to user_enter/exit are ignored. This
- * may be further optimized using static keys.
- */
- bool active;
- enum {
- IN_KERNEL = 0,
- IN_USER,
- } state;
-};
-
-static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
.active = true,
#endif
};
+/**
+ * user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed the remaining kernel instructions
+ * to execute won't use any RCU read side critical section because this
+ * function sets RCU in extended quiescent state.
+ */
void user_enter(void)
{
unsigned long flags;
@@ -39,40 +51,90 @@ void user_enter(void)
if (in_interrupt())
return;
+ /* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
if (__this_cpu_read(context_tracking.active) &&
__this_cpu_read(context_tracking.state) != IN_USER) {
- __this_cpu_write(context_tracking.state, IN_USER);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
rcu_user_enter();
+ __this_cpu_write(context_tracking.state, IN_USER);
}
local_irq_restore(flags);
}
+
+/**
+ * user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from userspace
+ * before any use of RCU read side critical section. This potentially include
+ * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
void user_exit(void)
{
unsigned long flags;
- /*
- * Some contexts may involve an exception occuring in an irq,
- * leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
- * This would mess up the dyntick_nesting count though. And rcu_irq_*()
- * helpers are enough to protect RCU uses inside the exception. So
- * just return immediately if we detect we are in an IRQ.
- */
if (in_interrupt())
return;
local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) == IN_USER) {
- __this_cpu_write(context_tracking.state, IN_KERNEL);
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
rcu_user_exit();
+ vtime_user_exit(current);
+ __this_cpu_write(context_tracking.state, IN_KERNEL);
}
local_irq_restore(flags);
}
+void guest_enter(void)
+{
+ if (vtime_accounting_enabled())
+ vtime_guest_enter(current);
+ else
+ __guest_enter();
+}
+EXPORT_SYMBOL_GPL(guest_enter);
+
+void guest_exit(void)
+{
+ if (vtime_accounting_enabled())
+ vtime_guest_exit(current);
+ else
+ __guest_exit();
+}
+EXPORT_SYMBOL_GPL(guest_exit);
+
+
+/**
+ * context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next)
{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242c..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
+ cputime_t utime, stime;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
+ task_cputime(p, &utime, &stime);
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (p->utime || p->stime))
+ (utime || stime))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
"(state = %ld, flags = %x)\n",
p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Park the stopper thread */
+ kthread_park(current);
return 0;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
#include <linux/cgroup.h>
/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
-
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
+ struct work_struct hotplug_work;
};
/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+ struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+ if (pcgrp)
+ return cgroup_cs(pcgrp);
+ return NULL;
+}
+
#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
-/* the type of hotplug event */
-enum hotplug_event {
- CPUSET_CPU_OFFLINE,
- CPUSET_MEM_OFFLINE,
-};
-
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
+ cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
+ if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+ if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
/*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+static struct workqueue_struct *cpuset_propagate_hotplug_wq;
+
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
+
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask)
{
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
while (cs && !nodes_intersects(cs->mems_allowed,
node_states[N_MEMORY]))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
nodes_and(*pmask, cs->mems_allowed,
node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
struct cgroup *cont;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, cont, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, cont, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
+ (cpumask_empty(trial->cpus_allowed) ||
+ nodes_empty(trial->mems_allowed)))
+ goto out;
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup *pos_cgrp;
doms = NULL;
dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
- continue;
-
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
}
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
}
#endif /* CONFIG_SMP */
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-
-/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
- *
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
- */
-static void async_rebuild_sched_domains(void)
-{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
-}
-
-/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
- *
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
void rebuild_sched_domains(void)
{
- do_rebuild_sched_domains(NULL);
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
/**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
* @tsk: task to test
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Called for each task in a cgroup by cgroup_scan_tasks().
* Return nonzero if this tasks's cpus_allowed mask should be changed (in other
* words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
* cpus_allowed mask needs to be changed.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
heap_free(&heap);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
+ * Call holding cpuset_mutex, so current's cpuset won't change
* during this call, as manage_mutex holds off any cpuset_attach()
* calls. Therefore we don't need to take task_lock around the
* call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
/*
* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
* of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * memory_migrate flag is set. Called with cpuset_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cgroup_mutex */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
* @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* Called by cgroup_scan_tasks() for each task in a cgroup.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_flag(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
* @cs: the cpuset in which each task's spread flags needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
-
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct cpuset *cs = cgroup_cs(cgrp);
struct task_struct *task;
int ret;
+ mutex_lock(&cpuset_mutex);
+
+ ret = -ENOSPC;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ goto out_unlock;
cgroup_taskset_for_each(task, cgrp, tset) {
/*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
* set_cpus_allowed_ptr() on all attached tasks before
* cpus_allowed may be changed.
*/
+ ret = -EINVAL;
if (task->flags & PF_THREAD_BOUND)
- return -EINVAL;
- if ((ret = security_task_setscheduler(task)))
- return ret;
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
- return 0;
+static void cpuset_cancel_attach(struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ cgroup_cs(cgrp)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
}
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
+ /* static bufs protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_from;
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp);
+ mutex_lock(&cpuset_mutex);
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
cgroup_taskset_for_each(task, cgrp, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
&cpuset_attach_nodemask_to);
mmput(mm);
}
+
+ cs->attach_in_progress--;
+
+ /*
+ * We may have raced with CPU/memory hotunplug. Trigger hotplug
+ * propagation if @cs doesn't have any CPU or memory. It will move
+ * the newly added tasks to the nearest parent which can execute.
+ */
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ schedule_cpuset_propagate_hotplug(cs);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
const char *buf)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *trialcs;
+ int retval = -ENODEV;
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * Flushing cpuset_hotplug_work is enough to synchronize against
+ * hotplug hanlding; however, cpuset_attach() may schedule
+ * propagation work directly. Flush the workqueue too.
+ */
+ flush_work(&cpuset_hotplug_work);
+ flush_workqueue(cpuset_propagate_hotplug_wq);
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
- goto out;
+ goto out_unlock;
}
switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
-out:
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{
- struct cgroup *parent_cg = cont->parent;
- struct cgroup *tmp_cg;
- struct cpuset *parent, *cs;
+ struct cpuset *cs;
- if (!parent_cg)
+ if (!cont->parent)
return &top_cpuset.css;
- parent = cgroup_cs(parent_cg);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
+ INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1;
- cs->parent = parent;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup *pos_cg;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
number_of_cpusets++;
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
- goto skip_clone;
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
+ goto out_unlock;
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup.
*/
- list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
- struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
-
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
- goto skip_clone;
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_cg, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
}
+ rcu_read_unlock();
mutex_lock(&callback_mutex);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
mutex_unlock(&callback_mutex);
-skip_clone:
- return &cs->css;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
+}
+
+static void cpuset_css_offline(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+
+ mutex_lock(&cpuset_mutex);
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ number_of_cpusets--;
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
static void cpuset_css_free(struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- if (is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- number_of_cpusets--;
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = {
.name = "cpuset",
.css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.subsys_id = cpuset_subsys_id,
.base_cftypes = files,
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
{
struct cgroup *new_cgroup = scan->data;
+ cgroup_lock();
cgroup_attach_task(new_cgroup, tsk);
+ cgroup_unlock();
}
/**
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
* @from: cpuset in which the tasks currently reside
* @to: cpuset to which the tasks will be moved
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* callback_mutex must not be held, as cpuset_attach() will take it.
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
- */
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
-
- /*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
- parent = cs->parent;
+ parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ parent = parent_cs(parent);
move_member_tasks_to_cpuset(cs, parent);
}
-/*
- * Helper function to traverse cpusets.
- * It can be used to walk the cpuset tree from top to bottom, completing
- * one layer before dropping down to the next (thus always processing a
- * node before any of its children).
+/**
+ * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
+ * @cs: cpuset in interest
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static struct cpuset *cpuset_next(struct list_head *queue)
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
{
- struct cpuset *cp;
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems, tmp_mems;
+ struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
+ bool is_empty;
- if (list_empty(queue))
- return NULL;
+ mutex_lock(&cpuset_mutex);
+
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
- cp = list_first_entry(queue, struct cpuset, stack_list);
- list_del(queue->next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, queue);
+ /* remove offline cpus from @cs */
+ if (!cpumask_empty(&off_cpus)) {
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
+ update_tasks_cpumask(cs, NULL);
+ }
+
+ /* remove offline mems from @cs */
+ if (!nodes_empty(off_mems)) {
+ tmp_mems = cs->mems_allowed;
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(cs, &tmp_mems, NULL);
}
- return cp;
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If @cs became empty, move tasks to the nearest ancestor with
+ * execution resources. This is full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ /* the following may free @cs, should be the last operation */
+ css_put(&cs->css);
}
+/**
+ * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
+ * @cs: cpuset of interest
+ *
+ * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
+ * memory masks according to top_cpuset.
+ */
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
+{
+ /*
+ * Pin @cs. The refcnt will be released when the work item
+ * finishes executing.
+ */
+ if (!css_tryget(&cs->css))
+ return;
-/*
- * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
- * online/offline) and update the cpusets accordingly.
- * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
- * cpuset must be moved to a parent cpuset.
+ /*
+ * Queue @cs->hotplug_work. If already pending, lose the css ref.
+ * cpuset_propagate_hotplug_wq is ordered and propagation will
+ * happen in the order this function is called.
+ */
+ if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
+ css_put(&cs->css);
+}
+
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
*
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
+ * descendants.
*
- * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
- * if all present pages from a node are offlined.
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
*/
-static void
-scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
+static void cpuset_hotplug_workfn(struct work_struct *work)
{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- static nodemask_t oldmems; /* protected by cgroup_mutex */
+ static cpumask_t new_cpus, tmp_cpus;
+ static nodemask_t new_mems, tmp_mems;
+ bool cpus_updated, mems_updated;
+ bool cpus_offlined, mems_offlined;
- list_add_tail((struct list_head *)&root->stack_list, &queue);
+ mutex_lock(&cpuset_mutex);
- switch (event) {
- case CPUSET_CPU_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
- /* Continue past cpusets with all cpus online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
- continue;
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
+ &new_cpus);
- /* Remove offline cpus from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- mutex_unlock(&callback_mutex);
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
+ mems_offlined = !nodes_empty(tmp_mems);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_cpumask(cp, NULL);
- }
- break;
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
- case CPUSET_MEM_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ tmp_mems = top_cpuset.mems_allowed;
+ mutex_lock(&callback_mutex);
+ top_cpuset.mems_allowed = new_mems;
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+ }
- /* Continue past cpusets with all mems online */
- if (nodes_subset(cp->mems_allowed,
- node_states[N_MEMORY]))
- continue;
+ /* if cpus or mems went down, we need to propagate to descendants */
+ if (cpus_offlined || mems_offlined) {
+ struct cpuset *cs;
+ struct cgroup *pos_cgrp;
- oldmems = cp->mems_allowed;
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
+ schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_unlock();
+ }
- /* Remove offline mems from this cpuset. */
- mutex_lock(&callback_mutex);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_MEMORY]);
- mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_nodemask(cp, &oldmems, NULL);
- }
+ /* wait for propagations to finish */
+ flush_workqueue(cpuset_propagate_hotplug_wq);
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated) {
+ struct sched_domain_attr *attr;
+ cpumask_var_t *doms;
+ int ndoms;
+
+ mutex_lock(&cpuset_mutex);
+ ndoms = generate_sched_domains(&doms, &attr);
+ mutex_unlock(&cpuset_mutex);
+
+ partition_sched_domains(ndoms, doms, attr);
}
}
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * The only exception to this is suspend/resume, where we don't
- * modify cpusets at all.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- *
- * @cpu_online: Indicates whether this is a CPU online event (true) or
- * a CPU offline event (false).
- */
void cpuset_update_active_cpus(bool cpu_online)
{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
-
- cgroup_lock();
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- mutex_unlock(&callback_mutex);
-
- if (!cpu_online)
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
-
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
-
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks to the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- oldmems = top_cpuset.mems_allowed;
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
- mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
- break;
- case MEM_OFFLINE:
- /*
- * needn't update top_cpuset.mems_allowed explicitly because
- * scan_cpusets_upon_hotplug() will update it.
- */
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
- break;
- default:
- break;
- }
- cgroup_unlock();
-
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
#endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ cpuset_propagate_hotplug_wq =
+ alloc_ordered_workqueue("cpuset_hotplug", 0);
+ BUG_ON(!cpuset_propagate_hotplug_wq);
}
/**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
*/
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
*
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+
+ if (!dentry) {
+ strcpy(cpuset_name, "/");
+ } else {
+ spin_lock(&dentry->d_lock);
+ strlcpy(cpuset_name, (const char *)dentry->d_name.name,
+ CPUSET_NAME_LEN);
+ spin_unlock(&dentry->d_lock);
+ }
+
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
+ rcu_read_lock();
css = task_subsys_state(tsk, cpuset_subsys_id);
retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+ rcu_read_unlock();
if (retval < 0)
- goto out_unlock;
+ goto out_put_task;
seq_puts(m, buf);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
diff --git a/kernel/cred.c b/kernel/cred.c
index 48cea3da6d05..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -30,17 +30,6 @@
static struct kmem_cache *cred_jar;
/*
- * The common credentials for the initial task's thread group
- */
-#ifdef CONFIG_KEYS
-static struct thread_group_cred init_tgcred = {
- .usage = ATOMIC_INIT(2),
- .tgid = 0,
- .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
-};
-#endif
-
-/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
@@ -65,9 +54,6 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
-#ifdef CONFIG_KEYS
- .tgcred = &init_tgcred,
-#endif
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
}
/*
- * Dispose of the shared task group credentials
- */
-#ifdef CONFIG_KEYS
-static void release_tgcred_rcu(struct rcu_head *rcu)
-{
- struct thread_group_cred *tgcred =
- container_of(rcu, struct thread_group_cred, rcu);
-
- BUG_ON(atomic_read(&tgcred->usage) != 0);
-
- key_put(tgcred->session_keyring);
- key_put(tgcred->process_keyring);
- kfree(tgcred);
-}
-#endif
-
-/*
- * Release a set of thread group credentials.
- */
-static void release_tgcred(struct cred *cred)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = cred->tgcred;
-
- if (atomic_dec_and_test(&tgcred->usage))
- call_rcu(&tgcred->rcu, release_tgcred_rcu);
-#endif
-}
-
-/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
#endif
security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
- release_tgcred(cred);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
- if (!new->tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
- atomic_set(&new->tgcred->usage, 1);
-#endif
-
atomic_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void)
get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
+ key_get(new->session_keyring);
+ key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
- atomic_inc(&new->tgcred->usage);
#endif
#ifdef CONFIG_SECURITY
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
*/
struct cred *prepare_exec_creds(void)
{
- struct thread_group_cred *tgcred = NULL;
struct cred *new;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred)
- return NULL;
-#endif
-
new = prepare_creds();
- if (!new) {
- kfree(tgcred);
+ if (!new)
return new;
- }
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
- /* create a new per-thread-group creds for all this set of threads to
- * share */
- memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
-
/* inherit the session keyring; new process keyring */
- key_get(tgcred->session_keyring);
- tgcred->process_keyring = NULL;
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
#endif
return new;
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void)
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
struct cred *new;
int ret;
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
install_thread_keyring_to_cred(new);
}
- /* we share the process and session keyrings between all the threads in
- * a process - this is slightly icky as we violate COW credentials a
- * bit */
+ /* The process keyring is only shared between the threads in a process;
+ * anything outside of those threads doesn't inherit.
+ */
if (!(clone_flags & CLONE_THREAD)) {
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- ret = -ENOMEM;
- goto error_put;
- }
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = key_get(new->tgcred->session_keyring);
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
}
#endif
@@ -455,6 +372,31 @@ error_put:
return ret;
}
+static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
+{
+ const struct user_namespace *set_ns = set->user_ns;
+ const struct user_namespace *subset_ns = subset->user_ns;
+
+ /* If the two credentials are in the same user namespace see if
+ * the capabilities of subset are a subset of set.
+ */
+ if (set_ns == subset_ns)
+ return cap_issubset(subset->cap_permitted, set->cap_permitted);
+
+ /* The credentials are in a different user namespaces
+ * therefore one is a subset of the other only if a set is an
+ * ancestor of subset and set->euid is owner of subset or one
+ * of subsets ancestors.
+ */
+ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
+ if ((set_ns == subset_ns->parent) &&
+ uid_eq(subset_ns->owner, set->euid))
+ return true;
+ }
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -493,7 +435,7 @@ int commit_creds(struct cred *new)
!gid_eq(old->egid, new->egid) ||
!uid_eq(old->fsuid, new->fsuid) ||
!gid_eq(old->fsgid, new->fsgid) ||
- !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+ !cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
@@ -643,9 +585,6 @@ void __init cred_init(void)
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
const struct cred *old;
struct cred *new;
@@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
-#endif
-
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
@@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = NULL;
- new->tgcred = tgcred;
- new->request_key_auth = NULL;
+ new->session_keyring = NULL;
+ new->process_keyring = NULL;
new->thread_keyring = NULL;
+ new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc8..c26278fd4851 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
*/
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
+#include <linux/serial_core.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/console.h>
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7a..2235967e78b0 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode;
#ifdef CONFIG_KGDB_KDB
extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e064482..19d9a578c753 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
#include <linux/kernel.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/serial_core.h>
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
len = len / 2;
remcom_out_buffer[len++] = 0;
+ kdb_common_init_state(ks);
kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
strcpy(remcom_out_buffer, "OK");
}
break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5d..70a504601dc3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
/*
* kdb_ss
*
- * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
- * commands.
+ * Process the 'ss' (Single Step) command.
*
* ss
- * ssb
*
* Parameters:
* argc Argument count
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
* Outputs:
* None.
* Returns:
- * KDB_CMD_SS[B] for success, a kdb error if failure.
+ * KDB_CMD_SS for success, a kdb error if failure.
* Locking:
* None.
* Remarks:
*
* Set the arch specific option to trigger a debug trap after the next
* instruction.
- *
- * For 'ssb', set the trace flag in the debug trap handler
- * after printing the current insn and return directly without
- * invoking the kdb command processor, until a branch instruction
- * is encountered.
*/
static int kdb_ss(int argc, const char **argv)
{
- int ssb = 0;
-
- ssb = (strcmp(argv[0], "ssb") == 0);
if (argc != 0)
return KDB_ARGCOUNT;
/*
* Set trace flag and go.
*/
KDB_STATE_SET(DOING_SS);
- if (ssb) {
- KDB_STATE_SET(DOING_SSB);
- return KDB_CMD_SSB;
- }
return KDB_CMD_SS;
}
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void)
kdb_register_repeat("ss", kdb_ss, "",
"Single Step", 1, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("ssb", kdb_ss, "",
- "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d30..328d18ef31e4 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
static struct kgdb_state *kdb_ks;
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
int kdb_stub(struct kgdb_state *ks)
{
int error = 0;
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks)
}
/* Set initial kdb state variables */
KDB_STATE_CLEAR(KGDB_TRANS);
- kdb_initial_cpu = atomic_read(&kgdb_active);
- kdb_current_task = kgdb_info[ks->cpu].task;
- kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ kdb_common_init_state(ks);
/* Remove any breakpoints as needed by kdb and clear single step */
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
- KDB_STATE_CLEAR(DOING_SSB);
KDB_STATE_SET(PAGER);
/* zero out any offline cpu data */
for_each_present_cpu(i) {
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks)
* Upon exit from the kdb main loop setup break points and restart
* the system based on the requested continue state
*/
- kdb_initial_cpu = -1;
- kdb_current_task = NULL;
- kdb_current_regs = NULL;
+ kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4d5f8d5612f3..00eb8f7fbf41 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
};
#undef KDBMSG
-static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
/*
@@ -175,7 +175,7 @@ static char *__env[] = {
(char *)0,
};
-static const int __nenv = (sizeof(__env) / sizeof(char *));
+static const int __nenv = ARRAY_SIZE(__env);
struct task_struct *kdb_curr_task(int cpu)
{
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
}
if (argc != 3)
return KDB_ARGCOUNT;
- defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set) {
- kdb_printf("Could not allocate new defcmd_set entry for %s\n",
- argv[1]);
- defcmd_set = save_defcmd_set;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
memcpy(defcmd_set, save_defcmd_set,
defcmd_set_count * sizeof(*defcmd_set));
- kfree(save_defcmd_set);
s = defcmd_set + defcmd_set_count;
memset(s, 0, sizeof(*s));
s->usable = 1;
s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
if (s->usage[0] == '"') {
- strcpy(s->usage, s->usage+1);
+ strcpy(s->usage, argv[2]+1);
s->usage[strlen(s->usage)-1] = '\0';
}
if (s->help[0] == '"') {
- strcpy(s->help, s->help+1);
+ strcpy(s->help, argv[3]+1);
s->help[strlen(s->help)-1] = '\0';
}
++defcmd_set_count;
defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
}
/*
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
* KDB_CMD_GO User typed 'go'.
* KDB_CMD_CPU User switched to another cpu.
* KDB_CMD_SS Single step.
- * KDB_CMD_SSB Single step until branch.
*/
static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_dbtrap_t db_result)
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
instruction_pointer(regs));
break;
- case KDB_DB_SSB:
- /*
- * In the midst of ssb command. Just return.
- */
- KDB_DEBUG_STATE("kdb_local 3", reason);
- return KDB_CMD_SSB; /* Continue with SSB command */
-
- break;
case KDB_DB_SS:
break;
case KDB_DB_SSBPT:
@@ -1281,7 +1288,6 @@ do_full_getstr:
if (diag == KDB_CMD_GO
|| diag == KDB_CMD_CPU
|| diag == KDB_CMD_SS
- || diag == KDB_CMD_SSB
|| diag == KDB_CMD_KGDB)
break;
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
break;
}
- if (result == KDB_CMD_SSB) {
- KDB_STATE_SET(DOING_SS);
- KDB_STATE_SET(DOING_SSB);
- break;
- }
-
if (result == KDB_CMD_KGDB) {
if (!KDB_STATE(DOING_KGDB))
kdb_printf("Entering please attach debugger "
@@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("Module Size modstruct Used by\n");
list_for_each_entry(mod, kdb_modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
@@ -2348,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv)
return 0;
}
-/*
- * kdb_ll - This function implements the 'll' command which follows a
- * linked list and executes an arbitrary command for each
- * element.
- */
-static int kdb_ll(int argc, const char **argv)
-{
- int diag = 0;
- unsigned long addr;
- long offset = 0;
- unsigned long va;
- unsigned long linkoffset;
- int nextarg;
- const char *command;
-
- if (argc != 3)
- return KDB_ARGCOUNT;
-
- nextarg = 1;
- diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
- if (diag)
- return diag;
-
- diag = kdbgetularg(argv[2], &linkoffset);
- if (diag)
- return diag;
-
- /*
- * Using the starting address as
- * the first element in the list, and assuming that
- * the list ends with a null pointer.
- */
-
- va = addr;
- command = kdb_strdup(argv[3], GFP_KDB);
- if (!command) {
- kdb_printf("%s: cannot duplicate command\n", __func__);
- return 0;
- }
- /* Recursive use of kdb_parse, do not use argv after this point */
- argv = NULL;
-
- while (va) {
- char buf[80];
-
- if (KDB_FLAG(CMD_INTERRUPT))
- goto out;
-
- sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
- diag = kdb_parse(buf);
- if (diag)
- goto out;
-
- addr = va + linkoffset;
- if (kdb_getword(&va, addr, sizeof(va)))
- goto out;
- }
-
-out:
- kfree(command);
- return diag;
-}
-
static int kdb_kgdb(int argc, const char **argv)
{
return KDB_CMD_KGDB;
@@ -2428,11 +2367,15 @@ static int kdb_help(int argc, const char **argv)
kdb_printf("-----------------------------"
"-----------------------------\n");
for_each_kdbcmd(kt, i) {
- if (kt->cmd_name)
- kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
- kt->cmd_usage, kt->cmd_help);
+ char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
}
return 0;
}
@@ -2737,7 +2680,7 @@ int kdb_register_repeat(char *cmd,
(kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
kfree(kdb_commands);
}
- memset(new + kdb_max_commands, 0,
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
kdb_command_extend * sizeof(*new));
kdb_commands = new;
kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
@@ -2841,15 +2784,13 @@ static void __init kdb_inittab(void)
"Stack traceback", 1, KDB_REPEAT_NONE);
kdb_register_repeat("btp", kdb_bt, "<pid>",
"Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
- "Display stack all processes", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btc", kdb_bt, "",
"Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
KDB_REPEAT_NONE);
- kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
- "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
kdb_register_repeat("env", kdb_env, "",
"Show environment variables", 0, KDB_REPEAT_NONE);
kdb_register_repeat("set", kdb_set, "",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a25844..7afd3c8c41d5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
#define KDB_CMD_GO (-1001)
#define KDB_CMD_CPU (-1002)
#define KDB_CMD_SS (-1003)
-#define KDB_CMD_SSB (-1004)
#define KDB_CMD_KGDB (-1005)
/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
* kdb control */
#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
-#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
- * DOING_SS is also set */
#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
* after one ss, independent of
* DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
typedef enum {
KDB_DB_BPT, /* Breakpoint */
KDB_DB_SS, /* Single-step trap */
- KDB_DB_SSB, /* Single step to branch */
KDB_DB_SSBPT, /* Single step over breakpoint */
KDB_DB_NOBPT /* Spurious breakpoint */
} kdb_dbtrap_t;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
+ cputime_t utime, stime, stimescaled, utimescaled;
/* Though tsk->delays accessed later, early exit avoids
* unnecessary returning of other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
goto done;
tmp = (s64)d->cpu_run_real_total;
- cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ task_cputime(tsk, &utime, &stime);
+ cputime_to_timespec(utime + stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
- cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ cputime_to_timespec(utimescaled + stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f9ff5493171d..b0cd86501c30 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
}
/*
+ * Initialize event state based on the perf_event_attr::disabled.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+ event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+ PERF_EVENT_STATE_INACTIVE;
+}
+
+/*
* Called at perf_event creation and when events are attached/detached from a
* group.
*/
@@ -3682,7 +3691,7 @@ unlock:
static int perf_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct perf_event *event = filp->private_data;
int retval;
@@ -5117,7 +5126,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
struct perf_event *event;
- struct hlist_node *node;
struct hlist_head *head;
rcu_read_lock();
@@ -5125,7 +5133,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
if (!head)
goto end;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_event(event, nr, data, regs);
}
@@ -5410,7 +5418,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
{
struct perf_sample_data data;
struct perf_event *event;
- struct hlist_node *node;
struct perf_raw_record raw = {
.size = entry_size,
@@ -5420,7 +5427,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
@@ -5956,13 +5963,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
pmu->name = name;
if (type < 0) {
- int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
- if (!err)
- goto free_pdc;
-
- err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
- if (err) {
- ret = err;
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
goto free_pdc;
}
}
@@ -6155,18 +6158,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->parent = parent_event;
- event->ns = get_pid_ns(current->nsproxy->pid_ns);
+ event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
if (task) {
event->attach_state = PERF_ATTACH_TASK;
+
+ if (attr->type == PERF_TYPE_TRACEPOINT)
+ event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
*/
- if (attr->type == PERF_TYPE_BREAKPOINT)
+ else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task;
#endif
}
@@ -6179,8 +6185,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->overflow_handler = overflow_handler;
event->overflow_handler_context = context;
- if (attr->disabled)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event__state_init(event);
pmu = NULL;
@@ -6609,9 +6614,17 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_lock(&gctx->mutex);
perf_remove_from_context(group_leader);
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling);
+ perf_event__state_init(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507ed..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
err_alloc:
for_each_possible_cpu(err_cpu) {
for (i = 0; i < TYPE_MAX; i++)
- kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+ kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
if (err_cpu == cpu)
break;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acfbb071..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
static struct rb_root uprobes_tree = RB_ROOT;
-
-static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
-
-#define UPROBES_HASH_SZ 13
-
/*
- * We need separate register/unregister and mmap/munmap lock hashes because
- * of mmap_sem nesting.
- *
- * uprobe_register() needs to install probes on (potentially) all processes
- * and thus needs to acquire multiple mmap_sems (consequtively, not
- * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
- * for the particular process doing the mmap.
- *
- * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
- * because of lock order against i_mmap_mutex. This means there's a hole in
- * the register vma iteration where a mmap() can happen.
- *
- * Thus uprobe_register() can race with uprobe_mmap() and we can try and
- * install a probe where one is already installed.
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time. Probably a fine grained per inode count is better?
*/
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
-/* serialize (un)register */
-static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-
-#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
static struct percpu_rw_semaphore dup_mmap_sem;
-/*
- * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
- * events active at this time. Probably a fine grained per inode count is
- * better?
- */
-static atomic_t uprobe_events = ATOMIC_INIT(0);
-
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
-/* Dont run handlers when first register/ last unregister in progress*/
-#define UPROBE_RUN_HANDLER 1
/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP 2
+#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
+ struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
- struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
@@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
u = __insert_uprobe(uprobe);
spin_unlock(&uprobes_treelock);
- /* For now assume that the instruction need not be single-stepped */
- __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
-
return u;
}
@@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = igrab(inode);
uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
- mutex_init(&uprobe->copy_mutex);
+ /* For now assume that the instruction need not be single-stepped */
+ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
kfree(uprobe);
uprobe = cur_uprobe;
iput(inode);
- } else {
- atomic_inc(&uprobe_events);
}
return uprobe;
}
-static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
-{
- struct uprobe_consumer *uc;
-
- if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
- return;
-
- down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- if (!uc->filter || uc->filter(uc, current))
- uc->handler(uc, regs);
- }
- up_read(&uprobe->consumer_rwsem);
-}
-
-/* Returns the previous consumer */
-static struct uprobe_consumer *
-consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
up_write(&uprobe->consumer_rwsem);
-
- return uc->next;
}
/*
@@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
return ret;
- mutex_lock(&uprobe->copy_mutex);
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
goto out;
@@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
- mutex_unlock(&uprobe->copy_mutex);
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
return ret;
}
@@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
bool first_uprobe;
int ret;
- /*
- * If probe is being deleted, unregister thread could be done with
- * the vma-rmap-walk through. Adding a probe now can be fatal since
- * nobody will be able to cleanup. Also we could be from fork or
- * mremap path, where the probe might have already been inserted.
- * Hence behave as if probe already existed.
- */
- if (!uprobe->consumers)
- return 0;
-
ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
if (ret)
return ret;
@@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
- /* can happen if uprobe_register() fails */
- if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
- return 0;
-
set_bit(MMF_RECALC_UPROBES, &mm->flags);
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
*/
static void delete_uprobe(struct uprobe *uprobe)
{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
iput(uprobe->inode);
put_uprobe(uprobe);
- atomic_dec(&uprobe_events);
}
struct map_info {
@@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
return curr;
}
-static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
+ bool is_register = !!new;
struct map_info *info;
int err = 0;
@@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
- if (is_register)
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- else
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
unlock:
up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
return err;
}
-static int __uprobe_register(struct uprobe *uprobe)
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- return register_for_each_vma(uprobe, true);
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
}
-static void __uprobe_unregister(struct uprobe *uprobe)
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- if (!register_for_each_vma(uprobe, false))
- delete_uprobe(uprobe);
+ int err;
+
+ if (!consumer_del(uprobe, uc)) /* WARN? */
+ return;
+ err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
}
/*
@@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
struct uprobe *uprobe;
int ret;
- if (!inode || !uc || uc->next)
- return -EINVAL;
-
+ /* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
- ret = 0;
- mutex_lock(uprobes_hash(inode));
+ retry:
uprobe = alloc_uprobe(inode, offset);
-
- if (!uprobe) {
- ret = -ENOMEM;
- } else if (!consumer_add(uprobe, uc)) {
- ret = __uprobe_register(uprobe);
- if (ret) {
- uprobe->consumers = NULL;
- __uprobe_unregister(uprobe);
- } else {
- set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
}
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (!uprobe)
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
return ret;
}
@@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
{
struct uprobe *uprobe;
- if (!inode || !uc)
- return;
-
uprobe = find_uprobe(inode, offset);
if (!uprobe)
return;
- mutex_lock(uprobes_hash(inode));
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
- if (consumer_del(uprobe, uc)) {
- if (!uprobe->consumers) {
- __uprobe_unregister(uprobe);
- clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ vma->vm_file->f_mapping->host != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
}
+ up_read(&mm->mmap_sem);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ return err;
}
static struct rb_node *
@@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
struct uprobe *uprobe, *u;
struct inode *inode;
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+ if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
inode = vma->vm_file->f_mapping->host;
@@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
-
+ /*
+ * We can race with uprobe_unregister(), this uprobe can be already
+ * removed. But in this case filter_chain() must return false, all
+ * consumers have gone away.
+ */
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- if (!fatal_signal_pending(current)) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+ if (no_uprobe_events() || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
/* Slot allocation for XOL */
static int xol_add_vma(struct xol_area *area)
{
- struct mm_struct *mm;
- int ret;
-
- area->page = alloc_page(GFP_HIGHUSER);
- if (!area->page)
- return -ENOMEM;
-
- ret = -EALREADY;
- mm = current->mm;
+ struct mm_struct *mm = current->mm;
+ int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
ret = -ENOMEM;
-
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
ret = 0;
-
-fail:
+ fail:
up_write(&mm->mmap_sem);
- if (ret)
- __free_page(area->page);
return ret;
}
-static struct xol_area *get_xol_area(struct mm_struct *mm)
-{
- struct xol_area *area;
-
- area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
-
- return area;
-}
-
/*
- * xol_alloc_area - Allocate process's xol_area.
- * This area will be used for storing instructions for execution out of
- * line.
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
*
* Returns the allocated area or NULL.
*/
-static struct xol_area *xol_alloc_area(void)
+static struct xol_area *get_xol_area(void)
{
+ struct mm_struct *mm = current->mm;
struct xol_area *area;
+ area = mm->uprobes_state.xol_area;
+ if (area)
+ goto ret;
+
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
- return NULL;
+ goto out;
area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
-
if (!area->bitmap)
- goto fail;
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
init_waitqueue_head(&area->wq);
if (!xol_add_vma(area))
return area;
-fail:
+ __free_page(area->page);
+ free_bitmap:
kfree(area->bitmap);
+ free_area:
kfree(area);
-
- return get_xol_area(current->mm);
+ out:
+ area = mm->uprobes_state.xol_area;
+ ret:
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
}
/*
@@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
}
/*
- * xol_get_insn_slot - If was not allocated a slot, then
- * allocate a slot.
+ * xol_get_insn_slot - allocate a slot for xol.
* Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
unsigned long offset;
+ unsigned long xol_vaddr;
void *vaddr;
- area = get_xol_area(current->mm);
- if (!area) {
- area = xol_alloc_area();
- if (!area)
- return 0;
- }
- current->utask->xol_vaddr = xol_take_insn_slot(area);
+ area = get_xol_area();
+ if (!area)
+ return 0;
- /*
- * Initialize the slot if xol_vaddr points to valid
- * instruction slot.
- */
- if (unlikely(!current->utask->xol_vaddr))
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
return 0;
- current->utask->vaddr = slot_addr;
- offset = current->utask->xol_vaddr & ~PAGE_MASK;
+ /* Initialize the slot */
+ offset = xol_vaddr & ~PAGE_MASK;
vaddr = kmap_atomic(area->page);
memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
kunmap_atomic(vaddr);
@@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
*/
flush_dcache_page(area->page);
- return current->utask->xol_vaddr;
+ return xol_vaddr;
}
/*
@@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
return;
slot_addr = tsk->utask->xol_vaddr;
-
- if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
+ if (unlikely(!slot_addr))
return;
area = tsk->mm->uprobes_state.xol_area;
@@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
}
/*
- * Allocate a uprobe_task object for the task.
- * Called when the thread hits a breakpoint for the first time.
+ * Allocate a uprobe_task object for the task if if necessary.
+ * Called when the thread hits a breakpoint.
*
* Returns:
* - pointer to new uprobe_task on success
* - NULL otherwise
*/
-static struct uprobe_task *add_utask(void)
+static struct uprobe_task *get_utask(void)
{
- struct uprobe_task *utask;
-
- utask = kzalloc(sizeof *utask, GFP_KERNEL);
- if (unlikely(!utask))
- return NULL;
-
- current->utask = utask;
- return utask;
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
}
/* Prepare to single-step probed instruction out of line. */
static int
-pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
- return 0;
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
+
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
- return -EFAULT;
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
}
/*
@@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
* This is not strictly accurate, we can race with
* uprobe_unregister() and see the already removed
* uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
*/
if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
return;
@@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
return uprobe;
}
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = uc->handler(uc, regs);
+
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ remove &= rc;
+ }
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
*/
static void handle_swbp(struct pt_regs *regs)
{
- struct uprobe_task *utask;
struct uprobe *uprobe;
unsigned long bp_vaddr;
int uninitialized_var(is_swbp);
@@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
}
return;
}
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
/*
* TODO: move copy_insn/etc into _register and remove this hack.
* After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
*/
smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
- goto restart;
-
- utask = current->utask;
- if (!utask) {
- utask = add_utask();
- /* Cannot allocate; re-execute the instruction. */
- if (!utask)
- goto restart;
- }
+ goto out;
handler_chain(uprobe, regs);
if (can_skip_sstep(uprobe, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr)) {
- utask->active_uprobe = uprobe;
- utask->state = UTASK_SSTEP;
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- }
-restart:
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
+ /* can_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@@ -1609,10 +1649,8 @@ static int __init init_uprobes(void)
{
int i;
- for (i = 0; i < UPROBES_HASH_SZ; i++) {
- mutex_init(&uprobes_mutex[i]);
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- }
if (percpu_init_rwsem(&dup_mmap_sem))
return -ENOMEM;
diff --git a/kernel/exit.c b/kernel/exit.c
index 50d2e93c36ea..51e485ca9935 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/freezer.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
- /*
- * If we are the last child process in a pid namespace to be
- * reaped, notify the reaper sleeping zap_pid_ns_processes().
- */
- if (IS_ENABLED(CONFIG_PID_NS)) {
- struct task_struct *parent = p->real_parent;
-
- if ((task_active_pid_ns(parent)->child_reaper == parent) &&
- list_empty(&parent->children) &&
- (parent->flags & PF_EXITING))
- wake_up_process(parent);
- }
}
list_del_rcu(&p->thread_group);
}
@@ -97,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
@@ -135,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime += tsk->utime;
- sig->stime += tsk->stime;
- sig->gtime += tsk->gtime;
+ task_cputime(tsk, &utime, &stime);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -495,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
- schedule();
+ freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
@@ -845,7 +835,7 @@ void do_exit(long code)
/*
* Make sure we are holding no locks:
*/
- debug_check_no_locks_held(tsk);
+ debug_check_no_locks_held();
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
@@ -1104,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
sig = p->signal;
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
- psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 3c31e874afad..8d932b1c9056 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
- free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
@@ -823,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->first_nid = NUMA_PTE_SCAN_INIT;
+#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -1041,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
atomic_set(&sig->live, 1);
atomic_set(&sig->sigcnt, 1);
init_waitqueue_head(&sig->wait_chldexit);
- if (clone_flags & CLONE_NEWPID)
- sig->flags |= SIGNAL_UNKILLABLE;
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
@@ -1165,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
+ /*
+ * If the new process will be in a different pid namespace
+ * don't allow the creation of threads.
+ */
+ if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
+ (task_active_pid_ns(current) != current->nsproxy->pid_ns))
+ return ERR_PTR(-EINVAL);
+
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
@@ -1224,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_init(&p->vtime_seqlock);
+ p->vtime_snap = 0;
+ p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
+
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
@@ -1435,8 +1450,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
if (thread_group_leader(p)) {
- if (is_child_reaper(pid))
- p->nsproxy->pid_ns->child_reaper = p;
+ if (is_child_reaper(pid)) {
+ ns_of_pid(pid)->child_reaper = p;
+ p->signal->flags |= SIGNAL_UNKILLABLE;
+ }
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1470,8 +1487,6 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- if (unlikely(clone_flags & CLONE_NEWPID))
- pid_ns_release_proc(p->nsproxy->pid_ns);
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
@@ -1551,15 +1566,9 @@ long do_fork(unsigned long clone_flags,
* Do some preliminary argument and permissions checking before we
* actually start allocating stuff
*/
- if (clone_flags & CLONE_NEWUSER) {
- if (clone_flags & CLONE_THREAD)
+ if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
+ if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
return -EINVAL;
- /* hopefully this check will go away when userns support is
- * complete
- */
- if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
- !capable(CAP_SETGID))
- return -EPERM;
}
/*
@@ -1618,7 +1627,6 @@ long do_fork(unsigned long clone_flags,
return nr;
}
-#ifdef CONFIG_GENERIC_KERNEL_THREAD
/*
* Create a kernel thread.
*/
@@ -1627,7 +1635,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL);
}
-#endif
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
@@ -1667,8 +1674,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int, tls_val)
#endif
{
- return do_fork(clone_flags, newsp, 0,
- parent_tidptr, child_tidptr);
+ long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+ asmlinkage_protect(5, ret, clone_flags, newsp,
+ parent_tidptr, child_tidptr, tls_val);
+ return ret;
}
#endif
@@ -1721,7 +1730,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
{
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
@@ -1788,19 +1798,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *fd, *new_fd = NULL;
+ struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
- err = check_unshare_flags(unshare_flags);
- if (err)
- goto bad_unshare_out;
-
+ /*
+ * If unsharing a user namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWUSER)
+ unshare_flags |= CLONE_THREAD;
+ /*
+ * If unsharing a pid namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWPID)
+ unshare_flags |= CLONE_THREAD;
+ /*
+ * If unsharing a thread from a thread group, must also unshare vm.
+ */
+ if (unshare_flags & CLONE_THREAD)
+ unshare_flags |= CLONE_VM;
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (unshare_flags & CLONE_VM)
+ unshare_flags |= CLONE_SIGHAND;
/*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto bad_unshare_out;
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
@@ -1814,11 +1845,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
err = unshare_fd(unshare_flags, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
- err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+ err = unshare_userns(unshare_flags, &new_cred);
if (err)
goto bad_unshare_cleanup_fd;
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_cred, new_fs);
+ if (err)
+ goto bad_unshare_cleanup_cred;
- if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1826,10 +1861,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
exit_sem(current);
}
- if (new_nsproxy) {
+ if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
- new_nsproxy = NULL;
- }
task_lock(current);
@@ -1851,11 +1884,17 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
task_unlock(current);
- }
- if (new_nsproxy)
- put_nsproxy(new_nsproxy);
+ if (new_cred) {
+ /* Install the new user namespace */
+ commit_creds(new_cred);
+ new_cred = NULL;
+ }
+ }
+bad_unshare_cleanup_cred:
+ if (new_cred)
+ put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca003..f0090a993dab 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
#include <asm/futex.h>
@@ -225,7 +226,7 @@ static void drop_futex_key_refs(union futex_key *key)
* Returns a negative error code or 0
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
if (!futex_cmpxchg_enabled)
return -ENOSYS;
- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
rcu_read_lock();
ret = -ESRCH;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005fc..f9f44fd4d34d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
#include <linux/nsproxy.h>
#include <linux/futex.h>
#include <linux/ptrace.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
}
}
-asmlinkage long
-compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
- compat_size_t len)
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
{
if (!futex_cmpxchg_enabled)
return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
return 0;
}
-asmlinkage long
-compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
- compat_size_t __user *len_ptr)
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
{
struct compat_robust_list_head __user *head;
unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
if (!futex_cmpxchg_enabled)
return -ENOSYS;
- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
rcu_read_lock();
ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
return ret;
}
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
- struct compat_timespec __user *utime, u32 __user *uaddr2,
- u32 val3)
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
{
struct timespec ts;
ktime_t t, *tp = NULL;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..d4da55d1fb65 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..cc47812d3feb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
- if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
- if (wakeup) {
- raw_spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- raw_spin_lock(&base->cpu_base->lock);
- } else
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-
- return 1;
- }
-
- return 0;
+ return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
return 0;
}
@@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
*
* XXX send_remote_softirq() ?
*/
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
- hrtimer_enqueue_reprogram(timer, new_base, wakeup);
+ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
+ && hrtimer_enqueue_reprogram(timer, new_base)) {
+ if (wakeup) {
+ /*
+ * We need to drop cpu_base->lock to avoid a
+ * lock ordering issue vs. rq->lock.
+ */
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ local_irq_restore(flags);
+ return ret;
+ } else {
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+ }
unlock_hrtimer_base(timer, &flags);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
EXPORT_SYMBOL(irq_set_handler_data);
/**
- * irq_set_msi_desc - set MSI descriptor data for an irq
- * @irq: Interrupt number
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq
+ * Set the MSI descriptor entry for an irq at offset
*/
-int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+ struct msi_desc *entry)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return -EINVAL;
desc->irq_data.msi_desc = entry;
- if (entry)
- entry->irq = irq;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
irq_put_desc_unlock(desc, flags);
return 0;
}
/**
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
+ *
+ * Set the MSI descriptor entry for an irq
+ */
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+{
+ return irq_set_msi_desc_off(irq, 0, entry);
+}
+
+/**
* irq_set_chip_data - set irq chip data for an irq
* @irq: Interrupt number
* @data: Pointer to chip specific data
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 35c70c9e24d8..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/task_work.h>
#include "internals.h"
@@ -818,7 +819,7 @@ static void irq_thread_dtor(struct callback_head *unused)
action = kthread_data(tsk);
pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
- tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
+ tsk->comm, tsk->pid, action->irq);
desc = irq_to_desc(action->irq);
@@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
out:
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(enable_percpu_irq);
void disable_percpu_irq(unsigned int irq)
{
@@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
irq_percpu_disable(desc, cpu);
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(disable_percpu_irq);
/*
* Internal function to unregister a percpu irqaction.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323a..397db02209ed 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ unsigned int irq = (int)(long)PDE(file_inode(file))->data;
cpumask_var_t new_value;
int err;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
/*
* All handlers must agree on IRQF_SHARED, so we test just the
- * first. Check for action->next as well.
+ * first.
*/
action = desc->action;
if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER) ||
- (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
- !action->next)
+ (action->flags & __IRQF_TIMER))
goto out;
/* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
ret = IRQ_HANDLED;
+ /* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/processor.h>
-/*
- * An entry can be in one of four states:
- *
- * free NULL, 0 -> {claimed} : free to be used
- * claimed NULL, 3 -> {pending} : claimed to be enqueued
- * pending next, 3 -> {busy} : queued, pending callback
- * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- */
-
-#define IRQ_WORK_PENDING 1UL
-#define IRQ_WORK_BUSY 2UL
-#define IRQ_WORK_FLAGS 3UL
static DEFINE_PER_CPU(struct llist_head, irq_work_list);
+static DEFINE_PER_CPU(int, irq_work_raised);
/*
* Claim the entry so that no one else will poke at it.
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, nflags;
+ unsigned long flags, oflags, nflags;
+ /*
+ * Start with our best wish as a premise but only trust any
+ * flag value after cmpxchg() result.
+ */
+ flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
- flags = work->flags;
- if (flags & IRQ_WORK_PENDING)
- return false;
nflags = flags | IRQ_WORK_FLAGS;
- if (cmpxchg(&work->flags, flags, nflags) == flags)
+ oflags = cmpxchg(&work->flags, flags, nflags);
+ if (oflags == flags)
break;
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
+ flags = oflags;
cpu_relax();
}
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
}
/*
- * Queue the entry and raise the IPI if needed.
+ * Enqueue the irq_work @entry unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
*/
-static void __irq_work_queue(struct irq_work *work)
+void irq_work_queue(struct irq_work *work)
{
- bool empty;
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return;
+ /* Queue the entry and raise the IPI if needed. */
preempt_disable();
- empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
- /* The list was empty, raise self-interrupt to start processing. */
- if (empty)
- arch_irq_work_raise();
+ llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
+
+ /*
+ * If the work is not "lazy" or the tick is stopped, raise the irq
+ * work interrupt (if supported by the arch), otherwise, just wait
+ * for the next tick.
+ */
+ if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
+ if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+ arch_irq_work_raise();
+ }
preempt_enable();
}
+EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
- * Enqueue the irq_work @entry, returns true on success, failure when the
- * @entry was already enqueued by someone else.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue(struct irq_work *work)
+bool irq_work_needs_cpu(void)
{
- if (!irq_work_claim(work)) {
- /*
- * Already enqueued, can't do!
- */
+ struct llist_head *this_list;
+
+ this_list = &__get_cpu_var(irq_work_list);
+ if (llist_empty(this_list))
return false;
- }
- __irq_work_queue(work);
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+
return true;
}
-EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
- */
-void irq_work_run(void)
+static void __irq_work_run(void)
{
+ unsigned long flags;
struct irq_work *work;
struct llist_head *this_list;
struct llist_node *llnode;
+
+ /*
+ * Reset the "raised" state right before we check the list because
+ * an NMI may enqueue after we find the list empty from the runner.
+ */
+ __this_cpu_write(irq_work_raised, 0);
+ barrier();
+
this_list = &__get_cpu_var(irq_work_list);
if (llist_empty(this_list))
return;
- BUG_ON(!in_irq());
BUG_ON(!irqs_disabled());
llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
*/
- work->flags = IRQ_WORK_BUSY;
+ flags = work->flags & ~IRQ_WORK_PENDING;
+ xchg(&work->flags, flags);
+
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
+ (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+ BUG_ON(!in_irq());
+ __irq_work_run();
+}
EXPORT_SYMBOL_GPL(irq_work_run);
/*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int irq_work_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_DYING:
+ /* Called from stop_machine */
+ if (WARN_ON_ONCE(cpu != smp_processor_id()))
+ break;
+ __irq_work_run();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_notify;
+
+static __init int irq_work_init_cpu_notifier(void)
+{
+ cpu_notify.notifier_call = irq_work_cpu_notify;
+ cpu_notify.priority = 0;
+ register_cpu_notifier(&cpu_notify);
+ return 0;
+}
+device_initcall(irq_work_init_cpu_notifier);
+
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 30b7b225306c..e30ac0fe61c3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -4,6 +4,7 @@
#include <linux/string.h>
#include <linux/random.h>
#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/cache.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5d..bddd3d7a74b6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
+struct resource crashk_low_res = {
+ .name = "Crash kernel low",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
int kexec_should_crash(struct task_struct *p)
{
@@ -223,6 +229,8 @@ out:
}
+static void kimage_free_page_list(struct list_head *list);
+
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
if (result)
goto out;
- *rimage = image;
-
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ goto out_free;
}
image->swap_page = kimage_alloc_control_pages(image, 0);
if (!image->swap_page) {
printk(KERN_ERR "Could not allocate swap buffer\n");
- goto out;
+ goto out_free;
}
- result = 0;
- out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kimage_free_page_list(&image->control_pages);
+ kfree(image);
+out:
return result;
}
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out;
+ goto out_free;
}
/*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ goto out_free;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kfree(image);
+out:
return result;
}
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break;
- if (hole_end > crashk_res.end)
- break;
/* See if I overlap any of the segments */
for (i = 0; i < image->nr_segments; i++) {
unsigned long mstart, mend;
@@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline,
* That function is the entry point for command line parsing and should be
* called from the arch-specific code.
*/
-int __init parse_crashkernel(char *cmdline,
+static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
- unsigned long long *crash_base)
+ unsigned long long *crash_base,
+ const char *name)
{
char *p = cmdline, *ck_cmdline = NULL;
char *first_colon, *first_space;
@@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline,
*crash_base = 0;
/* find crashkernel and use the last one if there are more */
- p = strstr(p, "crashkernel=");
+ p = strstr(p, name);
while (p) {
ck_cmdline = p;
- p = strstr(p+1, "crashkernel=");
+ p = strstr(p+1, name);
}
if (!ck_cmdline)
return -EINVAL;
- ck_cmdline += 12; /* strlen("crashkernel=") */
+ ck_cmdline += strlen(name);
/*
* if the commandline contains a ':', then that's the extended
@@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline,
return 0;
}
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=");
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel_low=");
+}
static void update_vmcoreinfo_note(void)
{
@@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, _count);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b81d24..000000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
-/*
- * A generic kernel FIFO implementation
- *
- * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/log2.h>
-#include <linux/uaccess.h>
-#include <linux/kfifo.h>
-
-/*
- * internal helper to calculate the unused elements in a fifo
- */
-static inline unsigned int kfifo_unused(struct __kfifo *fifo)
-{
- return (fifo->mask + 1) - (fifo->in - fifo->out);
-}
-
-int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
- size_t esize, gfp_t gfp_mask)
-{
- /*
- * round down to the next power of 2, since our 'let the indices
- * wrap' technique works only in this case.
- */
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
-
- if (size < 2) {
- fifo->data = NULL;
- fifo->mask = 0;
- return -EINVAL;
- }
-
- fifo->data = kmalloc(size * esize, gfp_mask);
-
- if (!fifo->data) {
- fifo->mask = 0;
- return -ENOMEM;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_alloc);
-
-void __kfifo_free(struct __kfifo *fifo)
-{
- kfree(fifo->data);
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = 0;
- fifo->data = NULL;
- fifo->mask = 0;
-}
-EXPORT_SYMBOL(__kfifo_free);
-
-int __kfifo_init(struct __kfifo *fifo, void *buffer,
- unsigned int size, size_t esize)
-{
- size /= esize;
-
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
- fifo->data = buffer;
-
- if (size < 2) {
- fifo->mask = 0;
- return -EINVAL;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_init);
-
-static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(fifo->data + off, src, l);
- memcpy(fifo->data, src + l, len - l);
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_in(struct __kfifo *fifo,
- const void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- kfifo_copy_in(fifo, buf, len, fifo->in);
- fifo->in += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in);
-
-static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(dst, fifo->data + off, l);
- memcpy(dst + l, fifo->data, len - l);
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_out_peek(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- kfifo_copy_out(fifo, buf, len, fifo->out);
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_peek);
-
-unsigned int __kfifo_out(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- len = __kfifo_out_peek(fifo, buf, len);
- fifo->out += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out);
-
-static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
- const void __user *from, unsigned int len, unsigned int off,
- unsigned int *copied)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned long ret;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_from_user(fifo->data + off, from, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_from_user(fifo->data, from + l, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->in += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_from_user);
-
-static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
- unsigned int len, unsigned int off, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_to_user(to, fifo->data + off, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_to_user(to + l, fifo->data, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->out += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_to_user);
-
-static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
- int nents, unsigned int len)
-{
- int n;
- unsigned int l;
- unsigned int off;
- struct page *page;
-
- if (!nents)
- return 0;
-
- if (!len)
- return 0;
-
- n = 0;
- page = virt_to_page(buf);
- off = offset_in_page(buf);
- l = 0;
-
- while (len >= l + PAGE_SIZE - off) {
- struct page *npage;
-
- l += PAGE_SIZE;
- buf += PAGE_SIZE;
- npage = virt_to_page(buf);
- if (page_to_phys(page) != page_to_phys(npage) - l) {
- sg_set_page(sgl, page, l - off, off);
- sgl = sg_next(sgl);
- if (++n == nents || sgl == NULL)
- return n;
- page = npage;
- len -= l - off;
- l = off = 0;
- }
- }
- sg_set_page(sgl, page, len, off);
- return n + 1;
-}
-
-static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
- int nents, unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned int n;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
- n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
-
- return n;
-}
-
-unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare);
-
-unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare);
-
-unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
-{
- unsigned int max = (1 << (recsize << 3)) - 1;
-
- if (len > max)
- return max;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_max_r);
-
-#define __KFIFO_PEEK(data, out, mask) \
- ((data)[(out) & (mask)])
-/*
- * __kfifo_peek_n internal helper function for determinate the length of
- * the next record in the fifo
- */
-static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int l;
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- l = __KFIFO_PEEK(data, fifo->out, mask);
-
- if (--recsize)
- l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
-
- return l;
-}
-
-#define __KFIFO_POKE(data, in, mask, val) \
- ( \
- (data)[(in) & (mask)] = (unsigned char)(val) \
- )
-
-/*
- * __kfifo_poke_n internal helper function for storeing the length of
- * the record into the fifo
- */
-static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
-{
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- __KFIFO_POKE(data, fifo->in, mask, n);
-
- if (recsize > 1)
- __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
-}
-
-unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
-{
- return __kfifo_peek_n(fifo, recsize);
-}
-EXPORT_SYMBOL(__kfifo_len_r);
-
-unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
- unsigned int len, size_t recsize)
-{
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- __kfifo_poke_n(fifo, len, recsize);
-
- kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
- fifo->in += len + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in_r);
-
-static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
- void *buf, unsigned int len, size_t recsize, unsigned int *n)
-{
- *n = __kfifo_peek_n(fifo, recsize);
-
- if (len > *n)
- len = *n;
-
- kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
- return len;
-}
-
-unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
-}
-EXPORT_SYMBOL(__kfifo_out_peek_r);
-
-unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
- fifo->out += n + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_r);
-
-void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int n;
-
- n = __kfifo_peek_n(fifo, recsize);
- fifo->out += n + recsize;
-}
-EXPORT_SYMBOL(__kfifo_skip_r);
-
-int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo)) {
- *copied = 0;
- return 0;
- }
-
- __kfifo_poke_n(fifo, len, recsize);
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->in += len + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_from_user_r);
-
-int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
- unsigned int n;
-
- if (fifo->in == fifo->out) {
- *copied = 0;
- return 0;
- }
-
- n = __kfifo_peek_n(fifo, recsize);
- if (len > n)
- len = n;
-
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->out += n + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_to_user_r);
-
-unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
-
-void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
- unsigned int len, size_t recsize)
-{
- len = __kfifo_max_r(len, recsize);
- __kfifo_poke_n(fifo, len, recsize);
- fifo->in += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
-
-unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > fifo->in - fifo->out)
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
-
-void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int len;
-
- len = __kfifo_peek_n(fifo, recsize);
- fifo->out += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1c317e386831..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
+#include <linux/async.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
@@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
+ /*
+ * We don't allow synchronous module loading from async. Module
+ * init may invoke async_synchronize_full() which will end up
+ * waiting for this task which already is waiting for the module
+ * loading to complete, leading to a deadlock.
+ */
+ WARN_ON_ONCE(wait && current_is_async());
+
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
@@ -219,9 +228,9 @@ static int ____call_usermodehelper(void *data)
commit_creds(new);
- retval = kernel_execve(sub_info->path,
- (const char *const *)sub_info->argv,
- (const char *const *)sub_info->envp);
+ retval = do_execve(sub_info->path,
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
if (!retval)
return 0;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..e35be53f6613 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
struct kprobe __kprobes *get_kprobe(void *addr)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (p->addr == addr)
return p;
}
@@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
-static DECLARE_COMPLETION(optimizer_comp);
#define OPTIMIZE_DELAY 5
/*
@@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
/* Start optimizer after OPTIMIZE_DELAY passed */
static __kprobes void kick_kprobe_optimizer(void)
{
- if (!delayed_work_pending(&optimizing_work))
- schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
@@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
- else
- /* Wake up all waiters */
- complete_all(&optimizer_comp);
}
/* Wait for completing optimization and unoptimization */
static __kprobes void wait_for_kprobe_optimizer(void)
{
- if (delayed_work_pending(&optimizing_work))
- wait_for_completion(&optimizer_comp);
+ mutex_lock(&kprobe_mutex);
+
+ while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
+ mutex_unlock(&kprobe_mutex);
+
+ /* this will also make optimizing_work execute immmediately */
+ flush_delayed_work(&optimizing_work);
+ /* @optimizing_work might not have been queued yet, relax */
+ cpu_relax();
+
+ mutex_lock(&kprobe_mutex);
+ }
+
+ mutex_unlock(&kprobe_mutex);
}
/* Optimize kprobe if p is ready to be optimized */
@@ -792,7 +798,6 @@ out:
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -803,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void)
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
@@ -814,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void)
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -825,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void)
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
@@ -919,7 +923,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
}
#endif /* CONFIG_OPTPROBES */
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +968,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
(unsigned long)p->addr, 1, 0);
WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
}
-#else /* !KPROBES_CAN_USE_FTRACE */
+#else /* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p) arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p) do {} while (0)
#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1141,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
unsigned long hash, flags = 0;
if (unlikely(!kprobes_initialized))
@@ -1152,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
kretprobe_table_lock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task == tk)
recycle_rp_inst(ri, &empty_rp);
}
kretprobe_table_unlock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
@@ -1166,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
- hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+ hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
@@ -1178,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
{
unsigned long flags, hash;
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
struct hlist_head *head;
/* No race here */
for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
kretprobe_table_lock(hash, &flags);
head = &kretprobe_inst_table[hash];
- hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+ hlist_for_each_entry_safe(ri, next, head, hlist) {
if (ri->rp == rp)
ri->rp = NULL;
}
@@ -1414,12 +1418,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
*/
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
/* Given address is not on the instruction boundary */
if ((unsigned long)p->addr != ftrace_addr)
return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
-#else /* !KPROBES_CAN_USE_FTRACE */
+#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
#endif
}
@@ -2021,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
{
struct module *mod = data;
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
@@ -2038,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2185,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
@@ -2194,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
head = &kprobe_table[i];
preempt_disable();
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (kprobe_aggrprobe(p)) {
@@ -2229,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = {
static void __kprobes arm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2242,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void)
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
arm_kprobe(p);
}
@@ -2258,7 +2259,6 @@ already_enabled:
static void __kprobes disarm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2275,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
disarm_kprobe(p, false);
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350d..259db207b5d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
- printk("BUG: MAX_LOCK_DEPTH too low!\n");
+ printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
+ curr->lockdep_depth, MAX_LOCK_DEPTH);
printk("turning off the locking correctness validator.\n");
+
+ lockdep_print_held_locks(current);
+ debug_show_all_locks();
dump_stack();
+
return 0;
}
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
static int
-print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
unsigned long ip)
{
if (!debug_locks_off())
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
return 0;
if (curr->lockdep_depth <= 0)
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
return 1;
}
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
if (hlock->instance == lock)
@@ -4083,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
}
EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
-static void print_held_locks_bug(struct task_struct *curr)
+static void print_held_locks_bug(void)
{
if (!debug_locks_off())
return;
@@ -4092,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr)
printk("\n");
printk("=====================================\n");
- printk("[ BUG: lock held at task exit time! ]\n");
+ printk("[ BUG: %s/%d still has locks held! ]\n",
+ current->comm, task_pid_nr(current));
print_kernel_ident();
printk("-------------------------------------\n");
- printk("%s/%d is exiting with locks still held!\n",
- curr->comm, task_pid_nr(curr));
- lockdep_print_held_locks(curr);
-
+ lockdep_print_held_locks(current);
printk("\nstack backtrace:\n");
dump_stack();
}
-void debug_check_no_locks_held(struct task_struct *task)
+void debug_check_no_locks_held(void)
{
- if (unlikely(task->lockdep_depth > 0))
- print_held_locks_bug(task);
+ if (unlikely(current->lockdep_depth > 0))
+ print_held_locks_bug();
}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
void debug_show_all_locks(void)
{
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 000000000000..246b4c6e6135
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,19 @@
+/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
+#ifndef SYMBOL_PREFIX
+#define ASM_SYMBOL(sym) sym
+#else
+#define PASTE2(x,y) x##y
+#define PASTE(x,y) PASTE2(x,y)
+#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
+#endif
+
+#define GLOBAL(name) \
+ .globl ASM_SYMBOL(name); \
+ ASM_SYMBOL(name):
+
+ .section ".init.data","aw"
+
+GLOBAL(modsign_certificate_list)
+ .incbin "signing_key.x509"
+ .incbin "extra_certificates"
+GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 767e559dfb10..2b6e69909c39 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -20,12 +20,6 @@ struct key *modsign_keyring;
extern __initdata const u8 modsign_certificate_list[];
extern __initdata const u8 modsign_certificate_list_end[];
-asm(".section .init.data,\"aw\"\n"
- SYMBOL_PREFIX "modsign_certificate_list:\n"
- ".incbin \"signing_key.x509\"\n"
- ".incbin \"extra_certificates\"\n"
- SYMBOL_PREFIX "modsign_certificate_list_end:"
- );
/*
* We need to make sure ccache doesn't cache the .o file as it doesn't notice
@@ -40,18 +34,15 @@ static __init int module_verify_init(void)
{
pr_notice("Initialise module verification\n");
- modsign_keyring = key_alloc(&key_type_keyring, ".module_sign",
- KUIDT_INIT(0), KGIDT_INIT(0),
- current_cred(),
- (KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA);
+ modsign_keyring = keyring_alloc(".module_sign",
+ KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(),
+ ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ),
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(modsign_keyring))
panic("Can't allocate module signing keyring\n");
- if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
- panic("Can't instantiate module signing keyring\n");
-
return 0;
}
diff --git a/kernel/module.c b/kernel/module.c
index 6e48c3a43599..0925c9a71975 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
#include <linux/ftrace_event.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
@@ -28,6 +29,7 @@
#include <linux/vmalloc.h>
#include <linux/elf.h>
#include <linux/proc_fs.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
@@ -59,6 +61,7 @@
#include <linux/pfn.h>
#include <linux/bsearch.h>
#include <linux/fips.h>
+#include <uapi/linux/module.h>
#include "module-internal.h"
#define CREATE_TRACE_POINTS
@@ -185,6 +188,7 @@ struct load_info {
ongoing or failed initialization etc. */
static inline int strong_try_module_get(struct module *mod)
{
+ BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
if (mod && mod->state == MODULE_STATE_COMING)
return -EBUSY;
if (try_module_get(mod))
@@ -193,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
return -ENOENT;
}
-static inline void add_taint_module(struct module *mod, unsigned flag)
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
{
- add_taint(flag);
+ add_taint(flag, lockdep_ok);
mod->taints |= (1U << flag);
}
@@ -340,6 +345,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
#endif
};
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
return true;
}
@@ -372,9 +380,6 @@ static bool check_symbol(const struct symsearch *syms,
printk(KERN_WARNING "Symbol %s is being used "
"by a non-GPL module, which will not "
"be allowed in the future\n", fsa->name);
- printk(KERN_WARNING "Please see the file "
- "Documentation/feature-removal-schedule.txt "
- "in the kernel source tree for more details.\n");
}
}
@@ -450,16 +455,24 @@ const struct kernel_symbol *find_symbol(const char *name,
EXPORT_SYMBOL_GPL(find_symbol);
/* Search for module by name: must hold module_mutex. */
-struct module *find_module(const char *name)
+static struct module *find_module_all(const char *name,
+ bool even_unformed)
{
struct module *mod;
list_for_each_entry(mod, &modules, list) {
+ if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (strcmp(mod->name, name) == 0)
return mod;
}
return NULL;
}
+
+struct module *find_module(const char *name)
+{
+ return find_module_all(name, false);
+}
EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
@@ -525,6 +538,8 @@ bool is_module_percpu_address(unsigned long addr)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (!mod->percpu_size)
continue;
for_each_possible_cpu(cpu) {
@@ -713,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
{
int ret = (flags & O_TRUNC);
if (ret)
- add_taint(TAINT_FORCED_RMMOD);
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
return ret;
}
#else
@@ -1048,6 +1063,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
case MODULE_STATE_GOING:
state = "going";
break;
+ default:
+ BUG();
}
return sprintf(buffer, "%s\n", state);
}
@@ -1122,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
if (!test_taint(TAINT_FORCED_MODULE))
printk(KERN_WARNING "%s: %s: kernel tainted.\n",
mod->name, reason);
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
return 0;
#else
return -ENOEXEC;
@@ -1786,6 +1803,8 @@ void set_all_modules_text_rw(void)
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((mod->module_core) && (mod->core_text_size)) {
set_page_attributes(mod->module_core,
mod->module_core + mod->core_text_size,
@@ -1807,6 +1826,8 @@ void set_all_modules_text_ro(void)
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((mod->module_core) && (mod->core_text_size)) {
set_page_attributes(mod->module_core,
mod->module_core + mod->core_text_size,
@@ -2127,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)
if (!test_taint(TAINT_PROPRIETARY_MODULE))
printk(KERN_WARNING "%s: module license '%s' taints "
"kernel.\n", mod->name, license);
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
}
}
@@ -2282,7 +2304,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
Elf_Shdr *symsect = info->sechdrs + info->index.sym;
Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
@@ -2293,9 +2315,6 @@ static void layout_symtab(struct module *mod, struct load_info *info)
src = (void *)info->hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
- /* strtab always starts with a nul, so offset 0 is the empty string. */
- strtab_size = 1;
-
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
@@ -2337,7 +2356,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->core_symtab = dst = mod->module_core + info->symoffs;
mod->core_strtab = s = mod->module_core + info->stroffs;
src = mod->symtab;
- *s++ = 0;
for (ndst = i = 0; i < mod->num_symtab; i++) {
if (i == 0 ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
@@ -2378,7 +2396,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
void * __weak module_alloc(unsigned long size)
{
- return size == 0 ? NULL : vmalloc_exec(size);
+ return vmalloc_exec(size);
}
static void *module_alloc_update_bounds(unsigned long size)
@@ -2425,18 +2443,17 @@ static inline void kmemleak_load_module(const struct module *mod,
#endif
#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info,
- const void *mod, unsigned long *_len)
+static int module_sig_check(struct load_info *info)
{
int err = -ENOKEY;
- unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
- unsigned long len = *_len;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const void *mod = info->hdr;
- if (len > markerlen &&
- memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ if (info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
- *_len -= markerlen;
- err = mod_verify_sig(mod, _len);
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, &info->len);
}
if (!err) {
@@ -2454,59 +2471,114 @@ static int module_sig_check(struct load_info *info,
return err;
}
#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info,
- void *mod, unsigned long *len)
+static int module_sig_check(struct load_info *info)
{
return 0;
}
#endif /* !CONFIG_MODULE_SIG */
-/* Sets info->hdr, info->len and info->sig_ok. */
-static int copy_and_check(struct load_info *info,
- const void __user *umod, unsigned long len,
- const char __user *uargs)
+/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
+static int elf_header_check(struct load_info *info)
+{
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
+ || info->hdr->e_type != ET_REL
+ || !elf_check_arch(info->hdr)
+ || info->hdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (info->hdr->e_shoff >= info->len
+ || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ info->len - info->hdr->e_shoff))
+ return -ENOEXEC;
+
+ return 0;
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
{
int err;
- Elf_Ehdr *hdr;
- if (len < sizeof(*hdr))
+ info->len = len;
+ if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
+ err = security_kernel_module_from_file(NULL);
+ if (err)
+ return err;
+
/* Suck in entire file: we'll want most of it. */
- if ((hdr = vmalloc(len)) == NULL)
+ info->hdr = vmalloc(info->len);
+ if (!info->hdr)
return -ENOMEM;
- if (copy_from_user(hdr, umod, len) != 0) {
- err = -EFAULT;
- goto free_hdr;
+ if (copy_from_user(info->hdr, umod, info->len) != 0) {
+ vfree(info->hdr);
+ return -EFAULT;
}
- err = module_sig_check(info, hdr, &len);
+ return 0;
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_fd(int fd, struct load_info *info)
+{
+ struct file *file;
+ int err;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
+
+ file = fget(fd);
+ if (!file)
+ return -ENOEXEC;
+
+ err = security_kernel_module_from_file(file);
if (err)
- goto free_hdr;
+ goto out;
+
+ err = vfs_getattr(&file->f_path, &stat);
+ if (err)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ err = -EFBIG;
+ goto out;
+ }
- /* Sanity checks against insmoding binaries or wrong arch,
- weird elf version */
- if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
- || hdr->e_type != ET_REL
- || !elf_check_arch(hdr)
- || hdr->e_shentsize != sizeof(Elf_Shdr)) {
- err = -ENOEXEC;
- goto free_hdr;
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ err = -EINVAL;
+ goto out;
}
- if (hdr->e_shoff >= len ||
- hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) {
- err = -ENOEXEC;
- goto free_hdr;
+ info->hdr = vmalloc(stat.size);
+ if (!info->hdr) {
+ err = -ENOMEM;
+ goto out;
}
- info->hdr = hdr;
- info->len = len;
- return 0;
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(info->hdr);
+ err = bytes;
+ goto out;
+ }
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+ info->len = pos;
-free_hdr:
- vfree(hdr);
+out:
+ fput(file);
return err;
}
@@ -2515,7 +2587,7 @@ static void free_copy(struct load_info *info)
vfree(info->hdr);
}
-static int rewrite_section_headers(struct load_info *info)
+static int rewrite_section_headers(struct load_info *info, int flags)
{
unsigned int i;
@@ -2543,7 +2615,10 @@ static int rewrite_section_headers(struct load_info *info)
}
/* Track but don't keep modinfo and version sections. */
- info->index.vers = find_sec(info, "__versions");
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
info->index.info = find_sec(info, ".modinfo");
info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2558,7 +2633,7 @@ static int rewrite_section_headers(struct load_info *info)
* Return the temporary module pointer (we'll replace it with the final
* one when we move the module sections around).
*/
-static struct module *setup_load_info(struct load_info *info)
+static struct module *setup_load_info(struct load_info *info, int flags)
{
unsigned int i;
int err;
@@ -2569,7 +2644,7 @@ static struct module *setup_load_info(struct load_info *info)
info->secstrings = (void *)info->hdr
+ info->sechdrs[info->hdr->e_shstrndx].sh_offset;
- err = rewrite_section_headers(info);
+ err = rewrite_section_headers(info, flags);
if (err)
return ERR_PTR(err);
@@ -2607,11 +2682,14 @@ static struct module *setup_load_info(struct load_info *info)
return mod;
}
-static int check_modinfo(struct module *mod, struct load_info *info)
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
const char *modmagic = get_modinfo(info, "vermagic");
int err;
+ if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ modmagic = NULL;
+
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "bad vermagic");
@@ -2624,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info)
}
if (!get_modinfo(info, "intree"))
- add_taint_module(mod, TAINT_OOT_MODULE);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP);
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
printk(KERN_WARNING "%s: module is from the staging directory,"
" the quality is unknown, you have been warned.\n",
mod->name);
@@ -2741,20 +2819,23 @@ static int move_module(struct module *mod, struct load_info *info)
memset(ptr, 0, mod->core_size);
mod->module_core = ptr;
- ptr = module_alloc_update_bounds(mod->init_size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
- */
- kmemleak_ignore(ptr);
- if (!ptr && mod->init_size) {
- module_free(mod, mod->module_core);
- return -ENOMEM;
- }
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
+ if (mod->init_size) {
+ ptr = module_alloc_update_bounds(mod->init_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. This block doesn't need to be
+ * scanned as it contains data and code that will be freed
+ * after the module is initialized.
+ */
+ kmemleak_ignore(ptr);
+ if (!ptr) {
+ module_free(mod, mod->module_core);
+ return -ENOMEM;
+ }
+ memset(ptr, 0, mod->init_size);
+ mod->module_init = ptr;
+ } else
+ mod->module_init = NULL;
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
@@ -2790,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)
* using GPL-only symbols it needs.
*/
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE);
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
/* driverloader was caught wrongly pretending to be under GPL */
if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
/* lve claims to be GPL but upstream won't provide source */
if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
@@ -2847,18 +2930,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
-static struct module *layout_and_allocate(struct load_info *info)
+static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
Elf_Shdr *pcpusec;
int err;
- mod = setup_load_info(info);
+ mod = setup_load_info(info, flags);
if (IS_ERR(mod))
return mod;
- err = check_modinfo(mod, info);
+ err = check_modinfo(mod, info, flags);
if (err)
return ERR_PTR(err);
@@ -2938,70 +3021,255 @@ static bool finished_loading(const char *name)
bool ret;
mutex_lock(&module_mutex);
- mod = find_module(name);
- ret = !mod || mod->state != MODULE_STATE_COMING;
+ mod = find_module_all(name, true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
return ret;
}
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
+/* This is where the real work happens */
+static int do_init_module(struct module *mod)
+{
+ int ret = 0;
+
+ /*
+ * We want to find out whether @mod uses async during init. Clear
+ * PF_USED_ASYNC. async_schedule*() will set it.
+ */
+ current->flags &= ~PF_USED_ASYNC;
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
+
+ /* Set RO and NX regions for core */
+ set_section_ro_nx(mod->module_core,
+ mod->core_text_size,
+ mod->core_ro_size,
+ mod->core_size);
+
+ /* Set RO and NX regions for init */
+ set_section_ro_nx(mod->module_init,
+ mod->init_text_size,
+ mod->init_ro_size,
+ mod->init_size);
+
+ do_mod_ctors(mod);
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ /* Init routine failed: abort. Try to protect us from
+ buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+ return ret;
+ }
+ if (ret > 0) {
+ printk(KERN_WARNING
+"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+"%s: loading module anyway...\n",
+ __func__, mod->name, ret,
+ __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! */
+ mod->state = MODULE_STATE_LIVE;
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_LIVE, mod);
+
+ /*
+ * We need to finish all async code before the module init sequence
+ * is done. This has potential to deadlock. For example, a newly
+ * detected block device can trigger request_module() of the
+ * default iosched from async probing task. Once userland helper
+ * reaches here, async_synchronize_full() will wait on the async
+ * task waiting on request_module() and deadlock.
+ *
+ * This deadlock is avoided by perfomring async_synchronize_full()
+ * iff module init queued any async jobs. This isn't a full
+ * solution as it will deadlock the same if module loading from
+ * async jobs nests more than once; however, due to the various
+ * constraints, this hack seems to be the best option for now.
+ * Please refer to the following thread for details.
+ *
+ * http://thread.gmane.org/gmane.linux.kernel/1420814
+ */
+ if (current->flags & PF_USED_ASYNC)
+ async_synchronize_full();
+
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+ mod->num_symtab = mod->core_num_syms;
+ mod->symtab = mod->core_symtab;
+ mod->strtab = mod->core_strtab;
+#endif
+ unset_module_init_ro_nx(mod);
+ module_free(mod, mod->module_init);
+ mod->module_init = NULL;
+ mod->init_size = 0;
+ mod->init_ro_size = 0;
+ mod->init_text_size = 0;
+ mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
+
+ return 0;
+}
+
+static int may_init_module(void)
+{
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ return 0;
+}
+
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources. In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+ struct module *old;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+again:
+ mutex_lock(&module_mutex);
+ if ((old = find_module_all(mod->name, true)) != NULL) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto out_unlocked;
+ goto again;
+ }
+ err = -EEXIST;
+ goto out;
+ }
+ list_add_rcu(&mod->list, &modules);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+out_unlocked:
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* This relies on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+
+ /* Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us. */
+ mod->state = MODULE_STATE_COMING;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
-static struct module *load_module(void __user *umod,
- unsigned long len,
- const char __user *uargs)
+static int load_module(struct load_info *info, const char __user *uargs,
+ int flags)
{
- struct load_info info = { NULL, };
- struct module *mod, *old;
+ struct module *mod;
long err;
- pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
+ err = module_sig_check(info);
+ if (err)
+ goto free_copy;
- /* Copy in the blobs from userspace, check they are vaguely sane. */
- err = copy_and_check(&info, umod, len, uargs);
+ err = elf_header_check(info);
if (err)
- return ERR_PTR(err);
+ goto free_copy;
/* Figure out module layout, and allocate all the memory. */
- mod = layout_and_allocate(&info);
+ mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
err = PTR_ERR(mod);
goto free_copy;
}
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
+ goto free_module;
+
#ifdef CONFIG_MODULE_SIG
- mod->sig_ok = info.sig_ok;
- if (!mod->sig_ok)
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ printk_once(KERN_NOTICE
+ "%s: module verification failed: signature and/or"
+ " required key missing - tainting kernel\n",
+ mod->name);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
+ }
#endif
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
- goto free_module;
+ goto unlink_mod;
/* Now we've got everything in the final locations, we can
* find optional sections. */
- find_module_sections(mod, &info);
+ find_module_sections(mod, info);
err = check_module_license_and_versions(mod);
if (err)
goto free_unload;
/* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, &info);
+ setup_modinfo(mod, info);
/* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(mod, &info);
+ err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;
- err = apply_relocations(mod, &info);
+ err = apply_relocations(mod, info);
if (err < 0)
goto free_modinfo;
- err = post_relocation(mod, &info);
+ err = post_relocation(mod, info);
if (err < 0)
goto free_modinfo;
@@ -3014,72 +3282,39 @@ static struct module *load_module(void __user *umod,
goto free_arch_cleanup;
}
- /* Mark state as coming so strong_try_module_get() ignores us. */
- mod->state = MODULE_STATE_COMING;
-
- /* Now sew it into the lists so we can get lockdep and oops
- * info during argument parsing. No one should access us, since
- * strong_try_module_get() will fail.
- * lockdep/oops can run asynchronous, so use the RCU list insertion
- * function to insert in a way safe to concurrent readers.
- * The mutex protects against concurrent writers.
- */
-again:
- mutex_lock(&module_mutex);
- if ((old = find_module(mod->name)) != NULL) {
- if (old->state == MODULE_STATE_COMING) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto free_arch_cleanup;
- goto again;
- }
- err = -EEXIST;
- goto unlock;
- }
-
- /* This has to be done once we're sure module name is unique. */
- dynamic_debug_setup(info.debug, info.num_debug);
-
- /* Find duplicate symbols */
- err = verify_export_symbols(mod);
- if (err < 0)
- goto ddebug;
+ dynamic_debug_setup(info->debug, info->num_debug);
- module_bug_finalize(info.hdr, info.sechdrs, mod);
- list_add_rcu(&mod->list, &modules);
- mutex_unlock(&module_mutex);
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
+ goto ddebug_cleanup;
/* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, &ddebug_dyndbg_module_param_cb);
if (err < 0)
- goto unlink;
+ goto bug_cleanup;
/* Link in to syfs. */
- err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
+ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
- goto unlink;
+ goto bug_cleanup;
/* Get rid of temporary copy. */
- free_copy(&info);
+ free_copy(info);
/* Done! */
trace_module_load(mod);
- return mod;
- unlink:
+ return do_init_module(mod);
+
+ bug_cleanup:
+ /* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
module_bug_cleanup(mod);
- wake_up_all(&module_wq);
- ddebug:
- dynamic_debug_remove(info.debug);
- unlock:
mutex_unlock(&module_mutex);
+ ddebug_cleanup:
+ dynamic_debug_remove(info->debug);
synchronize_sched();
kfree(mod->args);
free_arch_cleanup:
@@ -3088,107 +3323,59 @@ again:
free_modinfo(mod);
free_unload:
module_unload_free(mod);
+ unlink_mod:
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ wake_up_all(&module_wq);
+ mutex_unlock(&module_mutex);
free_module:
- module_deallocate(mod, &info);
+ module_deallocate(mod, info);
free_copy:
- free_copy(&info);
- return ERR_PTR(err);
-}
-
-/* Call module constructors. */
-static void do_mod_ctors(struct module *mod)
-{
-#ifdef CONFIG_CONSTRUCTORS
- unsigned long i;
-
- for (i = 0; i < mod->num_ctors; i++)
- mod->ctors[i]();
-#endif
+ free_copy(info);
+ return err;
}
-/* This is where the real work happens */
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
{
- struct module *mod;
- int ret = 0;
+ int err;
+ struct load_info info = { };
- /* Must have permission */
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
+ err = may_init_module();
+ if (err)
+ return err;
- /* Do all the hard work */
- mod = load_module(umod, len, uargs);
- if (IS_ERR(mod))
- return PTR_ERR(mod);
+ pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
+ err = copy_module_from_user(umod, len, &info);
+ if (err)
+ return err;
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
+ return load_module(&info, uargs, 0);
+}
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err;
+ struct load_info info = { };
- do_mod_ctors(mod);
- /* Start the module */
- if (mod->init != NULL)
- ret = do_one_initcall(mod->init);
- if (ret < 0) {
- /* Init routine failed: abort. Try to protect us from
- buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
- }
- if (ret > 0) {
- printk(KERN_WARNING
-"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
-"%s: loading module anyway...\n",
- __func__, mod->name, ret,
- __func__);
- dump_stack();
- }
+ err = may_init_module();
+ if (err)
+ return err;
- /* Now it's a first class citizen! */
- mod->state = MODULE_STATE_LIVE;
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_LIVE, mod);
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
- /* We need to finish all async code before the module init sequence is done */
- async_synchronize_full();
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC))
+ return -EINVAL;
- mutex_lock(&module_mutex);
- /* Drop initial reference. */
- module_put(mod);
- trim_init_extable(mod);
-#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
-#endif
- unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
- mutex_unlock(&module_mutex);
- wake_up_all(&module_wq);
+ err = copy_module_from_fd(fd, &info);
+ if (err)
+ return err;
- return 0;
+ return load_module(&info, uargs, flags);
}
static inline int within(unsigned long addr, void *start, unsigned long size)
@@ -3264,6 +3451,8 @@ const char *module_address_lookup(unsigned long addr,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
if (modname)
@@ -3287,6 +3476,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
const char *sym;
@@ -3311,6 +3502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
const char *sym;
@@ -3338,6 +3531,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
@@ -3380,9 +3575,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
ret = mod_find_symname(mod, colon+1);
*colon = ':';
} else {
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((ret = mod_find_symname(mod, name)) != 0)
break;
+ }
}
preempt_enable();
return ret;
@@ -3397,6 +3595,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
int ret;
list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
for (i = 0; i < mod->num_symtab; i++) {
ret = fn(data, mod->strtab + mod->symtab[i].st_name,
mod, mod->symtab[i].st_value);
@@ -3412,6 +3612,7 @@ static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
+ BUG_ON(mod->state == MODULE_STATE_UNFORMED);
if (mod->taints ||
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
@@ -3453,6 +3654,10 @@ static int m_show(struct seq_file *m, void *p)
struct module *mod = list_entry(p, struct module, list);
char buf[8];
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
seq_printf(m, "%s %u",
mod->name, mod->init_size + mod->core_size);
print_unload_info(m, mod);
@@ -3513,6 +3718,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (mod->num_exentries == 0)
continue;
@@ -3561,10 +3768,13 @@ struct module *__module_address(unsigned long addr)
if (addr < module_addr_min || addr > module_addr_max)
return NULL;
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_core(addr, mod)
|| within_module_init(addr, mod))
return mod;
+ }
return NULL;
}
EXPORT_SYMBOL_GPL(__module_address);
@@ -3617,8 +3827,11 @@ void print_modules(void)
printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
printk(" %s%s", mod->name, module_flags(mod, buf));
+ }
preempt_enable();
if (last_unloaded_module[0])
printk(" [last unloaded: %s]", last_unloaded_module);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..52f23011b6e0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
*/
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 7e1c3de1ce45..afc0456f227a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
* leave it to the caller to do proper locking and attach it to task.
*/
static struct nsproxy *create_new_namespaces(unsigned long flags,
- struct task_struct *tsk, struct fs_struct *new_fs)
+ struct task_struct *tsk, struct user_namespace *user_ns,
+ struct fs_struct *new_fs)
{
struct nsproxy *new_nsp;
int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
if (!new_nsp)
return ERR_PTR(-ENOMEM);
- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
}
- new_nsp->uts_ns = copy_utsname(flags, tsk);
+ new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
- new_nsp->ipc_ns = copy_ipcs(flags, tsk);
+ new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
}
- new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
+ new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
if (IS_ERR(new_nsp->pid_ns)) {
err = PTR_ERR(new_nsp->pid_ns);
goto out_pid;
}
- new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns);
+ new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;
@@ -122,6 +123,7 @@ out_ns:
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
+ struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
int err = 0;
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
CLONE_NEWPID | CLONE_NEWNET)))
return 0;
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
err = -EPERM;
goto out;
}
@@ -151,7 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
goto out;
}
- new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+ new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns)) {
err = PTR_ERR(new_ns);
goto out;
@@ -183,19 +185,21 @@ void free_nsproxy(struct nsproxy *ns)
* On success, returns the new nsproxy.
*/
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
- struct nsproxy **new_nsp, struct fs_struct *new_fs)
+ struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
+ struct user_namespace *user_ns;
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET)))
+ CLONE_NEWNET | CLONE_NEWPID)))
return 0;
- if (!capable(CAP_SYS_ADMIN))
+ user_ns = new_cred ? new_cred->user_ns : current_user_ns();
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = create_new_namespaces(unshare_flags, current,
- new_fs ? new_fs : current->fs);
+ *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+ new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
goto out;
@@ -241,20 +245,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
struct file *file;
int err;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
file = proc_ns_fget(fd);
if (IS_ERR(file))
return PTR_ERR(file);
err = -EINVAL;
- ei = PROC_I(file->f_dentry->d_inode);
+ ei = PROC_I(file_inode(file));
ops = ei->ns_ops;
if (nstype && (ops->type != nstype))
goto out;
- new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+ new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
if (IS_ERR(new_nsproxy)) {
err = PTR_ERR(new_nsproxy);
goto out;
diff --git a/kernel/padata.c b/kernel/padata.c
index 89fe3d1b9efb..072f4ee4eb89 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
int cpu, num_cpus;
unsigned int next_nr, next_index;
- struct padata_parallel_queue *queue, *next_queue;
+ struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
goto out;
}
- queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
- if (queue->cpu_index == next_queue->cpu_index) {
+ if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
goto out;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff97..7c57cc9eee2c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -259,26 +259,19 @@ unsigned long get_taint(void)
return tainted_mask;
}
-void add_taint(unsigned flag)
+/**
+ * add_taint: add a taint flag if not already set.
+ * @flag: one of the TAINT_* constants.
+ * @lockdep_ok: whether lock debugging is still OK.
+ *
+ * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for
+ * some notewortht-but-not-corrupting cases, it can be set to true.
+ */
+void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
- /*
- * Can't trust the integrity of the kernel anymore.
- * We don't call directly debug_locks_off() because the issue
- * is not necessarily serious enough to set oops_in_progress to 1
- * Also we want to keep up lockdep for staging/out-of-tree
- * development and post-warning case.
- */
- switch (flag) {
- case TAINT_CRAP:
- case TAINT_OOT_MODULE:
- case TAINT_WARN:
- case TAINT_FIRMWARE_WORKAROUND:
- break;
-
- default:
- if (__debug_locks_off())
- printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
- }
+ if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
+ printk(KERN_WARNING
+ "Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
}
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
print_modules();
dump_stack();
print_oops_end_marker();
- add_taint(taint);
+ /* Just a warning, don't kill lockdep. */
+ add_taint(taint, LOCKDEP_STILL_OK);
}
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
diff --git a/kernel/pid.c b/kernel/pid.c
index fd996c1ed9f8..047dc6264638 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+#include <linux/proc_fs.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = {
.last_pid = 0,
.level = 0,
.child_reaper = &init_task,
+ .user_ns = &init_user_ns,
+ .proc_inum = PROC_PID_INIT_INO,
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-int is_container_init(struct task_struct *tsk)
-{
- int ret = 0;
- struct pid *pid;
-
- rcu_read_lock();
- pid = task_pid(tsk);
- if (pid != NULL && pid->numbers[pid->level].nr == 1)
- ret = 1;
- rcu_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL(is_container_init);
-
/*
* Note: disable interrupts while the pidmap_lock is held as an
* interrupt might come in and do read_lock(&tasklist_lock).
@@ -269,8 +257,23 @@ void free_pid(struct pid *pid)
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- hlist_del_rcu(&pid->numbers[i].pid_chain);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+ hlist_del_rcu(&upid->pid_chain);
+ switch(--ns->nr_hashed) {
+ case 1:
+ /* When all that is left in the pid namespace
+ * is the reaper wake up the reaper. The reaper
+ * may be sleeping in zap_pid_ns_processes().
+ */
+ wake_up_process(ns->child_reaper);
+ break;
+ case 0:
+ schedule_work(&ns->proc_work);
+ break;
+ }
+ }
spin_unlock_irqrestore(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
@@ -292,6 +295,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
goto out;
tmp = ns;
+ pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
if (nr < 0)
@@ -302,22 +306,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = tmp->parent;
}
+ if (unlikely(is_child_reaper(pid))) {
+ if (pid_ns_prepare_proc(ns))
+ goto out_free;
+ }
+
get_pid_ns(ns);
- pid->level = ns->level;
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- for ( ; upid >= pid->numbers; --upid)
+ if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ goto out_unlock;
+ for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ upid->ns->nr_hashed++;
+ }
spin_unlock_irq(&pidmap_lock);
out:
return pid;
+out_unlock:
+ spin_unlock_irq(&pidmap_lock);
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
@@ -327,12 +341,18 @@ out_free:
goto out;
}
+void disable_pid_allocation(struct pid_namespace *ns)
+{
+ spin_lock_irq(&pidmap_lock);
+ ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ spin_unlock_irq(&pidmap_lock);
+}
+
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct hlist_node *elem;
struct upid *pnr;
- hlist_for_each_entry_rcu(pnr, elem,
+ hlist_for_each_entry_rcu(pnr,
&pid_hash[pid_hashfn(nr, ns)], pid_chain)
if (pnr->nr == nr && pnr->ns == ns)
return container_of(pnr, struct pid,
@@ -344,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
struct pid *find_vpid(int nr)
{
- return find_pid_ns(nr, current->nsproxy->pid_ns);
+ return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);
@@ -428,7 +448,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+ return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -483,7 +503,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
pid_t pid_vnr(struct pid *pid)
{
- return pid_nr_ns(pid, current->nsproxy->pid_ns);
+ return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);
@@ -494,7 +514,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
@@ -558,6 +578,9 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
+ /* Veryify no one has done anything silly */
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
@@ -569,6 +592,7 @@ void __init pidmap_init(void)
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb75..c1c3dc1c6023 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
@@ -71,10 +72,17 @@ err_alloc:
return NULL;
}
+static void proc_cleanup_work(struct work_struct *work)
+{
+ struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
+ pid_ns_release_proc(ns);
+}
+
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
-static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
+static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
+ struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
@@ -99,9 +107,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
if (ns->pid_cachep == NULL)
goto out_free_map;
+ err = proc_alloc_inum(&ns->proc_inum);
+ if (err)
+ goto out_free_map;
+
kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
+ ns->user_ns = get_user_ns(user_ns);
+ ns->nr_hashed = PIDNS_HASH_ADDING;
+ INIT_WORK(&ns->proc_work, proc_cleanup_work);
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -109,14 +124,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
- err = pid_ns_prepare_proc(ns);
- if (err)
- goto out_put_parent_pid_ns;
-
return ns;
-out_put_parent_pid_ns:
- put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
@@ -129,18 +138,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
+ proc_free_inum(ns->proc_inum);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
+ put_user_ns(ns->user_ns);
kmem_cache_free(pid_ns_cachep, ns);
}
-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
return get_pid_ns(old_ns);
- if (flags & (CLONE_THREAD|CLONE_PARENT))
+ if (task_active_pid_ns(current) != old_ns)
return ERR_PTR(-EINVAL);
- return create_pid_namespace(old_ns);
+ return create_pid_namespace(user_ns, old_ns);
}
static void free_pid_ns(struct kref *kref)
@@ -170,6 +182,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int rc;
struct task_struct *task, *me = current;
+ /* Don't allow any more processes into the pid namespace */
+ disable_pid_allocation(pid_ns);
+
/* Ignore SIGCHLD causing any terminated children to autoreap */
spin_lock_irq(&me->sighand->siglock);
me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
@@ -211,22 +226,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/*
* sys_wait4() above can't reap the TASK_DEAD children.
- * Make sure they all go away, see __unhash_process().
+ * Make sure they all go away, see free_pid().
*/
for (;;) {
- bool need_wait = false;
-
- read_lock(&tasklist_lock);
- if (!list_empty(&current->children)) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
- need_wait = true;
- }
- read_unlock(&tasklist_lock);
-
- if (!need_wait)
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (pid_ns->nr_hashed == 1)
break;
schedule();
}
+ __set_current_state(TASK_RUNNING);
if (pid_ns->reboot)
current->signal->group_exit_code = pid_ns->reboot;
@@ -239,9 +247,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
- if (write && !capable(CAP_SYS_ADMIN))
+ if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -250,7 +259,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &current->nsproxy->pid_ns->last_pid;
+ tmp.data = &pid_ns->last_pid;
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}
@@ -299,6 +308,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
+static void *pidns_get(struct task_struct *task)
+{
+ struct pid_namespace *ns;
+
+ rcu_read_lock();
+ ns = get_pid_ns(task_active_pid_ns(task));
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void pidns_put(void *ns)
+{
+ put_pid_ns(ns);
+}
+
+static int pidns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *ancestor, *new = ns;
+
+ if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
+ !nsown_capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Only allow entering the current active pid namespace
+ * or a child of the current active pid namespace.
+ *
+ * This is required for fork to return a usable pid value and
+ * this maintains the property that processes and their
+ * children can not escape their current pid namespace.
+ */
+ if (new->level < active->level)
+ return -EINVAL;
+
+ ancestor = new;
+ while (ancestor->level > active->level)
+ ancestor = ancestor->parent;
+ if (ancestor != active)
+ return -EINVAL;
+
+ put_pid_ns(nsproxy->pid_ns);
+ nsproxy->pid_ns = get_pid_ns(new);
+ return 0;
+}
+
+static unsigned int pidns_inum(void *ns)
+{
+ struct pid_namespace *pid_ns = ns;
+ return pid_ns->proc_inum;
+}
+
+const struct proc_ns_operations pidns_operations = {
+ .name = "pid",
+ .type = CLONE_NEWPID,
+ .get = pidns_get,
+ .put = pidns_put,
+ .install = pidns_install,
+ .inum = pidns_inum,
+};
+
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d73840271dce..8fd709c9bb58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
#include <asm/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
+#include <linux/random.h>
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
@@ -154,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,
static inline cputime_t prof_ticks(struct task_struct *p)
{
- return p->utime + p->stime;
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+
+ return utime + stime;
}
static inline cputime_t virt_ticks(struct task_struct *p)
{
- return p->utime;
+ cputime_t utime;
+
+ task_cputime(p, &utime, NULL);
+
+ return utime;
}
static int
@@ -470,16 +479,23 @@ static void cleanup_timers(struct list_head *head,
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
+ cputime_t utime, stime;
+
+ add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+ sizeof(unsigned long long));
+ task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+ utime, stime, tsk->se.sum_exec_runtime);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
struct signal_struct *const sig = tsk->signal;
+ cputime_t utime, stime;
+ task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->signal->cpu_timers,
- tsk->utime + sig->utime, tsk->stime + sig->stime,
+ utime + sig->utime, stime + sig->stime,
tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
@@ -1223,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
if (!task_cputime_zero(&tsk->cputime_expires)) {
struct task_cputime task_sample = {
- .utime = tsk->utime,
- .stime = tsk->stime,
+ .utime = utime,
+ .stime = stime,
.sum_exec_runtime = tsk->se.sum_exec_runtime
};
@@ -1398,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
while (!signal_pending(current)) {
if (timer.it.cpu.expires.sched == 0) {
/*
- * Our timer fired and was reset.
+ * Our timer fired and was reset, below
+ * deletion can not fail.
*/
+ posix_cpu_timer_del(&timer);
spin_unlock_irq(&timer.it_lock);
return 0;
}
@@ -1417,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ if (!error) {
+ /*
+ * Timer is now unarmed, deletion can not fail.
+ */
+ posix_cpu_timer_del(&timer);
+ }
spin_unlock_irq(&timer.it_lock);
+ while (error == TIMER_RETRY) {
+ /*
+ * We need to handle case when timer was or is in the
+ * middle of firing. In other cases we already freed
+ * resources.
+ */
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_del(&timer);
+ spin_unlock_irq(&timer.it_lock);
+ }
+
if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..6edbb2c55c22 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
- retry:
- if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
- error = -EAGAIN;
- goto out;
- }
+
+ idr_preload(GFP_KERNEL);
spin_lock_irq(&idr_lock);
- error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
+ error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
spin_unlock_irq(&idr_lock);
- if (error) {
- if (error == -EAGAIN)
- goto retry;
+ idr_preload_end();
+ if (error < 0) {
/*
* Weird looking, but we return EAGAIN if the IDR is
* full (proper POSIX return value for this)
*/
- error = -EAGAIN;
+ if (error == -ENOSPC)
+ error = -EAGAIN;
goto out;
}
+ new_timer_id = error;
it_id_set = IT_ID_SET;
new_timer->it_id = (timer_t) new_timer_id;
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
+ /*
+ * timer_t could be any type >= int and we want to make sure any
+ * @timer_id outside positive int range fails lookup.
+ */
+ if ((unsigned long long)timer_id > INT_MAX)
+ return NULL;
+
rcu_read_lock();
timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
@@ -997,7 +1002,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
err = kc->clock_adj(which_clock, &ktx);
- if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+ if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
return err;
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
void queue_up_suspend_work(void)
{
- if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
+ if (autosleep_state > PM_SUSPEND_ON)
queue_work(autosleep_wq, &suspend_work);
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de1..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_STANDBY;
+ suspend_state_t state = PM_SUSPEND_MIN;
const char * const *s;
#endif
char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
+#ifdef CONFIG_FREEZER
+static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", freeze_timeout_msecs);
+}
+
+static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ freeze_timeout_msecs = val;
+ return n;
+}
+
+power_attr(pm_freeze_timeout);
+
+#endif /* CONFIG_FREEZER*/
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
&pm_print_times_attr.attr,
#endif
#endif
+#ifdef CONFIG_FREEZER
+ &pm_freeze_timeout_attr.attr,
+#endif
NULL,
};
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6f..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
/*
* Timeout for stopping processes
*/
-#define TIMEOUT (20 * HZ)
+unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
do_gettimeofday(&start);
- end_time = jiffies + TIMEOUT;
+ end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
if (!user_only)
freeze_workqueues_begin();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7eaad6..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
if (new_value != req->node.prio)
pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
"%s called for unknown object.", __func__))
return;
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
if (new_value != req->node.prio)
pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
return;
}
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_REMOVE_REQ,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
#include "power.h"
const char *const pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_FREEZE] = "freeze",
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
static const struct platform_suspend_ops *suspend_ops;
+static bool need_suspend_ops(suspend_state_t state)
+{
+ return !!(state > PM_SUSPEND_FREEZE);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static bool suspend_freeze_wake;
+
+static void freeze_begin(void)
+{
+ suspend_freeze_wake = false;
+}
+
+static void freeze_enter(void)
+{
+ wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+}
+
+void freeze_wake(void)
+{
+ suspend_freeze_wake = true;
+ wake_up(&suspend_freeze_wait_head);
+}
+EXPORT_SYMBOL_GPL(freeze_wake);
+
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
bool valid_state(suspend_state_t state)
{
+ if (state == PM_SUSPEND_FREEZE)
+ return true;
/*
- * All states need lowlevel support and need to be valid to the lowlevel
+ * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
+ * support and need to be valid to the lowlevel
* implementation, no valid callback implies that none are valid.
*/
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
* hibernation). Run suspend notifiers, allocate the "suspend" console and
* freeze processes.
*/
-static int suspend_prepare(void)
+static int suspend_prepare(suspend_state_t state)
{
int error;
- if (!suspend_ops || !suspend_ops->enter)
+ if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
return -EPERM;
pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (suspend_ops->prepare) {
+ if (need_suspend_ops(state) && suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_finish;
}
- if (suspend_ops->prepare_late) {
+ if (need_suspend_ops(state) && suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
goto Platform_wake;
}
+ /*
+ * PM_SUSPEND_FREEZE equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus we should invoke freeze_enter() soon after
+ * all the devices are suspended.
+ */
+ if (state == PM_SUSPEND_FREEZE) {
+ freeze_enter();
+ goto Platform_wake;
+ }
+
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
enable_nonboot_cpus();
Platform_wake:
- if (suspend_ops->wake)
+ if (need_suspend_ops(state) && suspend_ops->wake)
suspend_ops->wake();
dpm_resume_start(PMSG_RESUME);
Platform_finish:
- if (suspend_ops->finish)
+ if (need_suspend_ops(state) && suspend_ops->finish)
suspend_ops->finish();
return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
int error;
bool wakeup = false;
- if (!suspend_ops)
+ if (need_suspend_ops(state) && !suspend_ops)
return -ENOSYS;
trace_machine_suspend(state);
- if (suspend_ops->begin) {
+ if (need_suspend_ops(state) && suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup
+ } while (!error && !wakeup && need_suspend_ops(state)
&& suspend_ops->suspend_again && suspend_ops->suspend_again());
Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
ftrace_start();
resume_console();
Close:
- if (suspend_ops->end)
+ if (need_suspend_ops(state) && suspend_ops->end)
suspend_ops->end();
trace_machine_suspend(PWR_EVENT_EXIT);
return error;
Recover_platform:
- if (suspend_ops->recover)
+ if (need_suspend_ops(state) && suspend_ops->recover)
suspend_ops->recover();
goto Resume_devices;
}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
+ if (state == PM_SUSPEND_FREEZE)
+ freeze_begin();
+
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
- error = suspend_prepare();
+ error = suspend_prepare(state);
if (error)
goto Unlock;
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac7..9b2a1d58558d 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
rtc_set_alarm(rtc, &alm);
}
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
+static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
if (!device_may_wakeup(candidate->dev.parent))
return 0;
- *(const char **)name_ptr = dev_name(dev);
return 1;
}
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
static char warn_no_rtc[] __initdata =
KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
- char *pony = NULL;
struct rtc_device *rtc = NULL;
+ struct device *dev;
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
}
/* RTCs have initialized by now too ... can we use one? */
- class_find_device(rtc_class, NULL, &pony, has_wakealarm);
- if (pony)
- rtc = rtc_class_open(pony);
+ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
+ if (dev)
+ rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
goto done;
diff --git a/kernel/printk.c b/kernel/printk.c
index 22e070f3470a..0b31715f335a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
#include <linux/notifier.h>
#include <linux/rculist.h>
#include <linux/poll.h>
+#include <linux/irq_work.h>
#include <asm/uaccess.h>
@@ -747,6 +748,21 @@ void __init setup_log_buf(int early)
free, (free * 100) / __LOG_BUF_LEN);
}
+static bool __read_mostly ignore_loglevel;
+
+static int __init ignore_loglevel_setup(char *str)
+{
+ ignore_loglevel = 1;
+ printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+ return 0;
+}
+
+early_param("ignore_loglevel", ignore_loglevel_setup);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
+ "print all kernel messages to the console.");
+
#ifdef CONFIG_BOOT_PRINTK_DELAY
static int boot_delay; /* msecs delay after each printk during bootup */
@@ -770,13 +786,15 @@ static int __init boot_delay_setup(char *str)
}
__setup("boot_delay=", boot_delay_setup);
-static void boot_delay_msec(void)
+static void boot_delay_msec(int level)
{
unsigned long long k;
unsigned long timeout;
- if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ || (level >= console_loglevel && !ignore_loglevel)) {
return;
+ }
k = (unsigned long long)loops_per_msec * boot_delay;
@@ -795,7 +813,7 @@ static void boot_delay_msec(void)
}
}
#else
-static inline void boot_delay_msec(void)
+static inline void boot_delay_msec(int level)
{
}
#endif
@@ -853,10 +871,11 @@ static size_t print_time(u64 ts, char *buf)
if (!printk_time)
return 0;
+ rem_nsec = do_div(ts, 1000000000);
+
if (!buf)
- return 15;
+ return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
- rem_nsec = do_div(ts, 1000000000);
return sprintf(buf, "[%5lu.%06lu] ",
(unsigned long)ts, rem_nsec / 1000);
}
@@ -1238,21 +1257,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
}
-static bool __read_mostly ignore_loglevel;
-
-static int __init ignore_loglevel_setup(char *str)
-{
- ignore_loglevel = 1;
- printk(KERN_INFO "debug: ignoring loglevel setting.\n");
-
- return 0;
-}
-
-early_param("ignore_loglevel", ignore_loglevel_setup);
-module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
- "print all kernel messages to the console.");
-
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
@@ -1498,7 +1502,7 @@ asmlinkage int vprintk_emit(int facility, int level,
int this_cpu;
int printed_len = 0;
- boot_delay_msec();
+ boot_delay_msec(level);
printk_delay();
/* This stops the holder of console_sem just where we want him */
@@ -1964,30 +1968,32 @@ int is_console_locked(void)
static DEFINE_PER_CPU(int, printk_pending);
static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
-void printk_tick(void)
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- if (__this_cpu_read(printk_pending)) {
- int pending = __this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_SCHED) {
- char *buf = __get_cpu_var(printk_sched_buf);
- printk(KERN_WARNING "[sched_delayed] %s", buf);
- }
- if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
+ int pending = __this_cpu_xchg(printk_pending, 0);
+
+ if (pending & PRINTK_PENDING_SCHED) {
+ char *buf = __get_cpu_var(printk_sched_buf);
+ printk(KERN_WARNING "[sched_delayed] %s", buf);
}
-}
-int printk_needs_cpu(int cpu)
-{
- if (cpu_is_offline(cpu))
- printk_tick();
- return __this_cpu_read(printk_pending);
+ if (pending & PRINTK_PENDING_WAKEUP)
+ wake_up_interruptible(&log_wait);
}
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+ .func = wake_up_klogd_work_func,
+ .flags = IRQ_WORK_LAZY,
+};
+
void wake_up_klogd(void)
{
- if (waitqueue_active(&log_wait))
+ preempt_disable();
+ if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ }
+ preempt_enable();
}
static void console_cont_flush(char *text, size_t size)
@@ -2468,6 +2474,7 @@ int printk_sched(const char *fmt, ...)
va_end(args);
__this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
local_irq_restore(flags);
return r;
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42f..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@ struct profile_hit {
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
-/* Oprofile timer tick hook */
-static int (*timer_hook)(struct pt_regs *) __read_mostly;
-
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
@@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
-int register_timer_hook(int (*hook)(struct pt_regs *))
-{
- if (timer_hook)
- return -EBUSY;
- timer_hook = hook;
- return 0;
-}
-EXPORT_SYMBOL_GPL(register_timer_hook);
-
-void unregister_timer_hook(int (*hook)(struct pt_regs *))
-{
- WARN_ON(hook != timer_hook);
- timer_hook = NULL;
- /* make sure all CPUs see the NULL hook */
- synchronize_sched(); /* Allow ongoing interrupts to complete. */
-}
-EXPORT_SYMBOL_GPL(unregister_timer_hook);
-
-
#ifdef CONFIG_SMP
/*
* Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (type == CPU_PROFILING && timer_hook)
- timer_hook(regs);
if (!user_mode(regs) && prof_cpu_mask != NULL &&
cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f5e55dda955..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child)
* TASK_KILLABLE sleeps.
*/
if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
- signal_wake_up(child, task_is_traced(child));
+ ptrace_signal_wake_up(child, true);
spin_unlock(&child->sighand->siglock);
}
+/* Ensure that nothing can wake it up, even SIGKILL */
+static bool ptrace_freeze_traced(struct task_struct *task)
+{
+ bool ret = false;
+
+ /* Lockless, nobody but us can set this flag */
+ if (task->jobctl & JOBCTL_LISTENING)
+ return ret;
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ task->state = __TASK_TRACED;
+ ret = true;
+ }
+ spin_unlock_irq(&task->sighand->siglock);
+
+ return ret;
+}
+
+static void ptrace_unfreeze_traced(struct task_struct *task)
+{
+ if (task->state != __TASK_TRACED)
+ return;
+
+ WARN_ON(!task->ptrace || task->parent != current);
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ spin_unlock_irq(&task->sighand->siglock);
+}
+
/**
* ptrace_check_attach - check whether ptracee is ready for ptrace operation
* @child: ptracee to check for
@@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child)
* RETURNS:
* 0 on success, -ESRCH if %child is not ready.
*/
-int ptrace_check_attach(struct task_struct *child, bool ignore_state)
+static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
int ret = -ESRCH;
@@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
- if ((child->ptrace & PT_PTRACED) && child->parent == current) {
+ if (child->ptrace && child->parent == current) {
+ WARN_ON(child->state == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
*/
- spin_lock_irq(&child->sighand->siglock);
- WARN_ON_ONCE(task_is_stopped(child));
- if (ignore_state || (task_is_traced(child) &&
- !(child->jobctl & JOBCTL_LISTENING)))
+ if (ignore_state || ptrace_freeze_traced(child))
ret = 0;
- spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !ignore_state)
- ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
+ if (!ret && !ignore_state) {
+ if (!wait_task_inactive(child, __TASK_TRACED)) {
+ /*
+ * This can only happen if may_ptrace_stop() fails and
+ * ptrace_stop() changes ->state back to TASK_RUNNING,
+ * so we should not worry about leaking __TASK_TRACED.
+ */
+ WARN_ON(child->state == __TASK_TRACED);
+ ret = -ESRCH;
+ }
+ }
- /* All systems go.. */
return ret;
}
@@ -215,8 +254,12 @@ ok:
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
+ rcu_read_lock();
+ if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ rcu_read_unlock();
return -EPERM;
+ }
+ rcu_read_unlock();
return security_ptrace_access_check(task, mode);
}
@@ -280,8 +323,10 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize)
flags |= PT_SEIZED;
- if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
+ rcu_read_lock();
+ if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
flags |= PT_PTRACE_CAP;
+ rcu_read_unlock();
task->ptrace = flags;
__ptrace_link(task, current);
@@ -311,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request,
*/
if (task_is_stopped(task) &&
task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
- signal_wake_up(task, 1);
+ signal_wake_up_state(task, __TASK_STOPPED);
spin_unlock(&task->sighand->siglock);
@@ -457,6 +502,9 @@ void exit_ptrace(struct task_struct *tracer)
return;
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
+ if (unlikely(p->ptrace & PT_EXITKILL))
+ send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
+
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, &ptrace_dead);
}
@@ -664,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
kiov->iov_len, kiov->iov_base);
}
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code. We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
#endif
int ptrace_request(struct task_struct *child, long request,
@@ -728,7 +782,7 @@ int ptrace_request(struct task_struct *child, long request,
* tracee into STOP.
*/
if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
- signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+ ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
unlock_task_sighand(child, &flags);
ret = 0;
@@ -754,7 +808,7 @@ int ptrace_request(struct task_struct *child, long request,
* start of this trap and now. Trigger re-trap.
*/
if (child->jobctl & JOBCTL_TRAP_NOTIFY)
- signal_wake_up(child, true);
+ ptrace_signal_wake_up(child, true);
ret = 0;
}
unlock_task_sighand(child, &flags);
@@ -891,6 +945,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
out_put_task_struct:
put_task_struct(child);
@@ -1030,8 +1086,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
ret = ptrace_check_attach(child, request == PTRACE_KILL ||
request == PTRACE_INTERRUPT);
- if (!ret)
+ if (!ret) {
ret = compat_arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
+ }
out_put_task_struct:
put_task_struct(child);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 20dfba576c2b..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
extern int rcu_expedited;
+#ifdef CONFIG_RCU_STALL_COMMON
+
+extern int rcu_cpu_stall_suppress;
+int rcu_jiffies_till_stall_check(void);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b44..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
-void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
+void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old, unsigned long c)
{
- trace_rcu_torture_read(rcutorturename, rhp);
+ trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
}
EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#else
-#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
#endif
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+
+module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
+
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2a..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
-#include "rcutiny_plugin.h"
-
static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+#include "rcutiny_plugin.h"
+
/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
static void rcu_idle_enter_common(long long newval)
{
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
* interrupts don't count, we must be running at the first interrupt
* level.
*/
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
{
return rcu_dynticks_nesting <= 1;
}
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
+ reset_cpu_stall_ticks(rcp);
if (rcp->rcucblist != NULL &&
rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
*/
void rcu_check_callbacks(int cpu, int user)
{
+ check_cpu_stalls();
if (user || rcu_is_cpu_rrupt_from_idle())
rcu_sched_qs(cpu);
else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309b..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
struct rcu_head **curtail; /* ->next pointer of last CB. */
RCU_TRACE(long qlen); /* Number of pending CBs. */
+ RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
+ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
+ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
RCU_TRACE(char *name); /* Name of RCU type. */
};
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#ifdef CONFIG_RCU_TRACE
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long j;
+ unsigned long js;
+
+ if (rcu_cpu_stall_suppress)
+ return;
+ rcp->ticks_this_gp++;
+ j = jiffies;
+ js = rcp->jiffies_stall;
+ if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+ pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
+ rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+ jiffies - rcp->gp_start, rcp->qlen);
+ dump_stack();
+ }
+ if (*rcp->curtail && ULONG_CMP_GE(j, js))
+ rcp->jiffies_stall = jiffies +
+ 3 * rcu_jiffies_till_stall_check() + 3;
+ else if (ULONG_CMP_GE(j, js))
+ rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+}
+
+static void check_cpu_stall_preempt(void);
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
+{
+#ifdef CONFIG_RCU_TRACE
+ rcp->ticks_this_gp = 0;
+ rcp->gp_start = jiffies;
+ rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+#endif /* #ifdef CONFIG_RCU_TRACE */
+}
+
+static void check_cpu_stalls(void)
+{
+ RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
+ RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+ RCU_TRACE(check_cpu_stall_preempt());
+}
+
#ifdef CONFIG_TINY_PREEMPT_RCU
#include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
/* Official start of GP. */
rcu_preempt_ctrlblk.gpnum++;
RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
+ reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
/* Any blocked RCU readers block new GP. */
if (rcu_preempt_blocked_readers_any())
@@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
MODULE_LICENSE("GPL");
+static void check_cpu_stall_preempt(void)
+{
+#ifdef CONFIG_TINY_PREEMPT_RCU
+ check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
+#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+}
+
#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85fd..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
#include <linux/stat.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/trace_clock.h>
#include <asm/byteorder.h>
MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
#define rcu_can_boost() 0
#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+#ifdef CONFIG_RCU_TRACE
+static u64 notrace rcu_trace_clock_local(void)
+{
+ u64 ts = trace_clock_local();
+ unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+ return ts;
+}
+#else /* #ifdef CONFIG_RCU_TRACE */
+static u64 notrace rcu_trace_clock_local(void)
+{
+ return 0ULL;
+}
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg)
/* Wait for the next test interval. */
oldstarttime = boost_starttime;
while (ULONG_CMP_LT(jiffies, oldstarttime)) {
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_interruptible(oldstarttime - jiffies);
rcu_stutter_wait("rcu_torture_boost");
if (kthread_should_stop() ||
fullstop != FULLSTOP_DONTSTOP)
@@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void)
return;
if (atomic_xchg(&beenhere, 1) != 0)
return;
- do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
ftrace_dump(DUMP_ALL);
}
@@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
{
int idx;
int completed;
+ int completed_end;
static DEFINE_RCU_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
int pipe_count;
+ unsigned long long ts;
idx = cur_ops->readlock();
completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
@@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
cur_ops->readunlock(idx);
return;
}
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
spin_lock(&rand_lock);
@@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- if (pipe_count > 1)
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+ completed, completed_end);
rcutorture_trace_dump();
+ }
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
+ completed = completed_end - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1094,11 +1114,13 @@ static int
rcu_torture_reader(void *arg)
{
int completed;
+ int completed_end;
int idx;
DEFINE_RCU_RANDOM(rand);
struct rcu_torture *p;
int pipe_count;
struct timer_list t;
+ unsigned long long ts;
VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
set_user_nice(current, 19);
@@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg)
}
idx = cur_ops->readlock();
completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
@@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg)
schedule_timeout_interruptible(HZ);
continue;
}
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
cur_ops->read_delay(&rand);
@@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- if (pipe_count > 1)
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+ ts, completed, completed_end);
rcutorture_trace_dump();
+ }
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
+ completed = completed_end - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
set_cpus_allowed_ptr(reader_tasks[i],
shuffle_tmp_mask);
}
-
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed_ptr(fakewriter_tasks[i],
shuffle_tmp_mask);
}
-
if (writer_task)
set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
-
if (stats_task)
set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
+ if (stutter_task)
+ set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
+ if (fqs_task)
+ set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
+ if (shutdown_task)
+ set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+ if (onoff_task)
+ set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ if (stall_task)
+ set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
+ if (barrier_cbs_tasks)
+ for (i = 0; i < n_barrier_cbs; i++)
+ if (barrier_cbs_tasks[i])
+ set_cpus_allowed_ptr(barrier_cbs_tasks[i],
+ shuffle_tmp_mask);
+ if (barrier_task)
+ set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
if (rcu_idle_cpu == -1)
rcu_idle_cpu = num_online_cpus() - 1;
@@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
barrier_cbs_wq =
kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
GFP_KERNEL);
- if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
+ if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
init_waitqueue_head(&barrier_cbs_wq[i]);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614e..5b8ad827fd86 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
* can assume that there is but one task, allowing RCU to (for example)
- * optimized synchronize_sched() to a simple barrier(). When this variable
+ * optimize synchronize_sched() to a simple barrier(). When this variable
* is one, RCU must actually do all the hard work required to detect real
* grace periods. This variable is also used to suppress boot-time false
* positives from lockdep-RCU error checking.
@@ -217,12 +217,6 @@ module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
-int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
-int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
-
-module_param(rcu_cpu_stall_suppress, int, 0644);
-module_param(rcu_cpu_stall_timeout, int, 0644);
-
static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
@@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
}
/*
- * Does the current CPU require a yet-as-unscheduled grace period?
+ * Does the current CPU require a not-yet-started grace period?
+ * The caller must have disabled interrupts to prevent races with
+ * normal callback registry.
*/
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- struct rcu_head **ntp;
+ int i;
- ntp = rdp->nxttail[RCU_DONE_TAIL +
- (ACCESS_ONCE(rsp->completed) != rdp->completed)];
- return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
- !rcu_gp_in_progress(rsp);
+ if (rcu_gp_in_progress(rsp))
+ return 0; /* No, a grace period is already in progress. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL])
+ return 0; /* No, this is a no-CBs (or offline) CPU. */
+ if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+ return 1; /* Yes, this CPU has newly registered callbacks. */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
+ ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ rdp->nxtcompleted[i]))
+ return 1; /* Yes, CBs for future grace period. */
+ return 0; /* No grace period needed. */
}
/*
@@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
bool user)
{
- trace_rcu_dyntick("Start", oldval, 0);
+ trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
@@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
* interrupt from idle, return true. The caller must have at least
* disabled preemption.
*/
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
{
return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
}
@@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 0;
}
-static int jiffies_till_stall_check(void)
-{
- int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
-
- /*
- * Limit check must be consistent with the Kconfig limits
- * for CONFIG_RCU_CPU_STALL_TIMEOUT.
- */
- if (till_stall_check < 3) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
- till_stall_check = 3;
- } else if (till_stall_check > 300) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
- till_stall_check = 300;
- }
- return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
-}
-
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
rsp->gp_start = jiffies;
- rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
+ rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
}
/*
@@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
+ rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
rsp->jiffies_stall = jiffies +
- 3 * jiffies_till_stall_check() + 3;
+ 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
set_need_resched(); /* kick ourselves to get things going. */
@@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
}
}
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
/**
* rcu_cpu_stall_reset - prevent further stall warnings in current grace period
*
@@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void)
rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
}
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static void __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
-}
-
/*
* Update CPU-local rcu_data state to record the newly noticed grace period.
* This is used both when we started the grace period and when we notice
@@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp)
}
/*
+ * Determine the value that ->completed will have at the end of the
+ * next subsequent grace period. This is used to tag callbacks so that
+ * a CPU can invoke callbacks in a timely fashion even if that CPU has
+ * been dyntick-idle for an extended period with callbacks under the
+ * influence of RCU_FAST_NO_HZ.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ /*
+ * If RCU is idle, we just wait for the next grace period.
+ * But we can only be sure that RCU is idle if we are looking
+ * at the root rcu_node structure -- otherwise, a new grace
+ * period might have started, but just not yet gotten around
+ * to initializing the current non-root rcu_node structure.
+ */
+ if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
+ return rnp->completed + 1;
+
+ /*
+ * Otherwise, wait for a possible partial grace period and
+ * then the subsequent full grace period.
+ */
+ return rnp->completed + 2;
+}
+
+/*
+ * If there is room, assign a ->completed number to any callbacks on
+ * this CPU that have not already been assigned. Also accelerate any
+ * callbacks that were previously assigned a ->completed number that has
+ * since proven to be too conservative, which can happen if callbacks get
+ * assigned a ->completed number while RCU is idle, but with reference to
+ * a non-root rcu_node structure. This function is idempotent, so it does
+ * not hurt to call it repeatedly.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return;
+
+ /*
+ * Starting from the sublist containing the callbacks most
+ * recently assigned a ->completed number and working down, find the
+ * first sublist that is not assignable to an upcoming grace period.
+ * Such a sublist has something in it (first two tests) and has
+ * a ->completed number assigned that will complete sooner than
+ * the ->completed number for newly arrived callbacks (last test).
+ *
+ * The key point is that any later sublist can be assigned the
+ * same ->completed number as the newly arrived callbacks, which
+ * means that the callbacks in any of these later sublist can be
+ * grouped into a single sublist, whether or not they have already
+ * been assigned a ->completed number.
+ */
+ c = rcu_cbs_completed(rsp, rnp);
+ for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
+ !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
+ break;
+
+ /*
+ * If there are no sublist for unassigned callbacks, leave.
+ * At the same time, advance "i" one sublist, so that "i" will
+ * index into the sublist where all the remaining callbacks should
+ * be grouped into.
+ */
+ if (++i >= RCU_NEXT_TAIL)
+ return;
+
+ /*
+ * Assign all subsequent callbacks' ->completed number to the next
+ * full grace period and group them all in the sublist initially
+ * indexed by "i".
+ */
+ for (; i <= RCU_NEXT_TAIL; i++) {
+ rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
+ rdp->nxtcompleted[i] = c;
+ }
+
+ /* Trace depending on how much we were able to accelerate. */
+ if (!*rdp->nxttail[RCU_WAIT_TAIL])
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
+ else
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
+}
+
+/*
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist. This function is idempotent, so it does not hurt to
+ * invoke it repeatedly. As long as it is not invoked -too- often...
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ int i, j;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return;
+
+ /*
+ * Find all callbacks whose ->completed numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+ break;
+ rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
+ }
+ /* Clean up any sublist tail pointers that were misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
+
+ /* Copy down callbacks to fill in empty sublists. */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+ break;
+ rdp->nxttail[j] = rdp->nxttail[i];
+ rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
+ }
+
+ /* Classify any remaining callbacks. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
+}
+
+/*
* Advance this CPU's callbacks, but only if the current grace period
* has ended. This may be called only from the CPU to whom the rdp
* belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1080,12 +1190,15 @@ static void
__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
/* Did another grace period end? */
- if (rdp->completed != rnp->completed) {
+ if (rdp->completed == rnp->completed) {
- /* Advance callbacks. No harm if list empty. */
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ /* No, so just accelerate recent callbacks. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
+
+ } else {
+
+ /* Advance callbacks. */
+ rcu_advance_cbs(rsp, rnp, rdp);
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
@@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
/*
* Because there is no grace period in progress right now,
* any callbacks we have up to this point will be satisfied
- * by the next grace period. So promote all callbacks to be
- * handled after the end of the next grace period. If the
- * CPU is not yet aware of the end of the previous grace period,
- * we need to allow for the callback advancement that will
- * occur when it does become aware. Deadlock prevents us from
- * making it aware at this point: We cannot acquire a leaf
- * rcu_node ->lock while holding the root rcu_node ->lock.
+ * by the next grace period. So this is a good place to
+ * assign a grace period number to recently posted callbacks.
*/
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- if (rdp->completed == rsp->completed)
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ rcu_accelerate_cbs(rsp, rnp, rdp);
rsp->gp_flags = RCU_GP_FLAG_INIT;
raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
@@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ rcu_accelerate_cbs(rsp, rnp, rdp);
rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
}
@@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
long bl, count, count_lazy;
int i;
- /* If no callbacks are ready, just return.*/
+ /* If no callbacks are ready, just return. */
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
WARN_ON_ONCE(rdp->beenonline == 0);
- /*
- * Advance callbacks in response to end of earlier grace
- * period that some other CPU ended.
- */
+ /* Handle the end of a grace period that some other CPU ended. */
rcu_process_gp_end(rsp, rdp);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
/* Does this CPU require a not-yet-started grace period? */
+ local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+ raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
rcu_start_gp(rsp, flags); /* releases above lock */
+ } else {
+ local_irq_restore(flags);
}
/* If there are callbacks ready, invoke them. */
@@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
-#ifdef CONFIG_RCU_USER_QS
- WARN_ON_ONCE(rdp->dynticks->in_user);
-#endif
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ /* Silence gcc 4.8 warning about array index out of range. */
+ if (rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls overflow");
+
/* Initialize the level-tracking arrays. */
for (i = 0; i < rcu_num_lvls; i++)
@@ -3074,7 +3181,6 @@ void __init rcu_init(void)
cpu_notifier(rcu_cpu_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- check_cpu_stall_init();
}
#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093d..c896b5045d9d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@ struct rcu_dynticks {
/* idle-period nonlazy_posted snapshot. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-#ifdef CONFIG_RCU_USER_QS
- bool ignore_user_qs; /* Treat userspace as extended QS or not */
- bool in_user; /* Is the CPU in userland from RCU POV? */
-#endif
};
/* RCU's kthread states for tracing. */
@@ -282,6 +278,8 @@ struct rcu_data {
*/
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ /* grace periods for sublists. */
long qlen_lazy; /* # of lazy queued callbacks */
long qlen; /* # of queued callbacks, incl lazy */
long qlen_last_fqs_check;
@@ -343,11 +341,6 @@ struct rcu_data {
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
-#endif
#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
/* to take at least one */
/* scheduling clock irq */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f6e5ec2932b4..c1cc7e17ff9d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -40,8 +40,7 @@
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
-static bool rcu_nocb_poll; /* Offload kthread are to poll. */
-module_param(rcu_nocb_poll, bool, 0444);
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
static char __initdata nocb_buf[NR_CPUS * 5];
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
@@ -2159,6 +2158,13 @@ static int __init rcu_nocb_setup(char *str)
}
__setup("rcu_nocbs=", rcu_nocb_setup);
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = 1;
+ return 0;
+}
+early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+
/* Is the specified CPU a no-CPUs CPU? */
static bool is_nocb_cpu(int cpu)
{
@@ -2366,10 +2372,11 @@ static int rcu_nocb_kthread(void *arg)
for (;;) {
/* If not polling, wait for next batch of callbacks. */
if (!rcu_nocb_poll)
- wait_event(rdp->nocb_wq, rdp->nocb_head);
+ wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
list = ACCESS_ONCE(rdp->nocb_head);
if (!list) {
schedule_timeout_interruptible(1);
+ flush_signals(current);
continue;
}
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abbd..01ab081ac53a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!desc->count)
return 0;
- mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_lock(&file_inode(filp)->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&file_inode(filp)->i_mutex);
return desc->written;
}
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 3920d593e63c..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
return __res_counter_charge(counter, val, limit_fail_at, true);
}
-void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
+u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
{
if (WARN_ON(counter->usage < val))
val = counter->usage;
counter->usage -= val;
+ return counter->usage;
}
-void res_counter_uncharge_until(struct res_counter *counter,
- struct res_counter *top,
- unsigned long val)
+u64 res_counter_uncharge_until(struct res_counter *counter,
+ struct res_counter *top,
+ unsigned long val)
{
unsigned long flags;
struct res_counter *c;
+ u64 ret = 0;
local_irq_save(flags);
for (c = counter; c != top; c = c->parent) {
+ u64 r;
spin_lock(&c->lock);
- res_counter_uncharge_locked(c, val);
+ r = res_counter_uncharge_locked(c, val);
+ if (c == counter)
+ ret = r;
spin_unlock(&c->lock);
}
local_irq_restore(flags);
+ return ret;
}
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
{
- res_counter_uncharge_until(counter, NULL, val);
+ return res_counter_uncharge_until(counter, NULL, val);
}
static inline unsigned long long *
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
* See rt.c in preempt-rt for proper credits and further information
*/
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..7890b10084a7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/timer.h>
#include "rtmutex_common.h"
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 6850f53e02d8..b3c6c3fcd847 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_read_nested);
+void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+{
+ might_sleep();
+ rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+}
+
+EXPORT_SYMBOL(_down_write_nest_lock);
+
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
+ sched_offline_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
if (IS_ERR(tg))
goto out_free;
+ sched_online_group(tg, &root_task_group);
+
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 01edad9b5d71..7f12624a393c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
#endif
#include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
#include "../smpboot.h"
#define CREATE_TRACE_POINTS
@@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int sched_feat_set(char *cmp)
{
- char buf[64];
- char *cmp;
- int neg = 0;
int i;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
+ int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
@@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
}
}
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ i = sched_feat_set(cmp);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -1124,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
- /* Look for allowed, online CPU in same node. */
- for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_online(dest_cpu))
- continue;
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- return dest_cpu;
+ /*
+ * If the node that the cpu is on has been offlined, cpu_to_node()
+ * will return -1. There is no cpu on the node, and we should
+ * select the cpu on the other node.
+ */
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return dest_cpu;
+ }
}
for (;;) {
@@ -1515,7 +1533,8 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -1560,7 +1579,40 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_next_reset = jiffies;
+ p->mm->numa_scan_seq = 0;
+ }
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_SCHED_DEBUG
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sched_feat_set("NUMA");
+ else
+ sched_feat_set("NO_NUMA");
+}
+#else
+__read_mostly bool numabalancing_enabled;
+
+void set_numabalancing_state(bool enabled)
+{
+ numabalancing_enabled = enabled;
}
+#endif /* CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
/*
* fork()/clone()-time setup:
@@ -1700,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
@@ -1711,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
@@ -1927,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
@@ -1943,23 +1992,6 @@ unsigned long nr_running(void)
return sum;
}
-unsigned long nr_uninterruptible(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
-
- /*
- * Since we read the counters lockless, it might be slightly
- * inaccurate. Do not allow it to go below zero though:
- */
- if (unlikely((long)sum < 0))
- sum = 0;
-
- return sum;
-}
-
unsigned long long nr_context_switches(void)
{
int i;
@@ -2744,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (irqs_disabled())
print_irqtrace_events(prev);
dump_stack();
- add_taint(TAINT_WARN);
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
/*
@@ -3226,7 +3258,8 @@ void complete_all(struct completion *x)
EXPORT_SYMBOL(complete_all);
static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
@@ -3239,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
- timeout = schedule_timeout(timeout);
+ timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
@@ -3250,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
return timeout ?: 1;
}
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, timeout, state);
+ timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
/**
* wait_for_completion: - waits for completion of a task
* @x: holds the state of this particular completion
@@ -3297,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
EXPORT_SYMBOL(wait_for_completion_timeout);
/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
* @x: holds the state of this particular completion
*
@@ -4056,8 +4135,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
- goto out_unlock;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
retval = security_task_setscheduler(p);
if (retval)
@@ -4326,7 +4411,7 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
struct task_struct *curr = current;
struct rq *rq, *p_rq;
unsigned long flags;
- bool yielded = 0;
+ int yielded = 0;
local_irq_save(flags);
rq = this_rq();
@@ -4632,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
@@ -7125,7 +7211,6 @@ static void free_sched_group(struct task_group *tg)
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7137,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7146,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7164,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7175,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
/* change task's runqueue when it moves between groups.
@@ -7473,6 +7566,25 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -7529,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
return &tg->css;
}
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *parent;
+
+ if (!cgrp->parent)
+ return 0;
+
+ parent = cgroup_tg(cgrp->parent);
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
@@ -7536,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
sched_destroy_group(tg);
}
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_offline_group(tg);
+}
+
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
@@ -7891,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
*/
#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..ed12cbb135f4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
+#include <linux/context_tracking.h>
#include "sched.h"
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
task_group_account_field(p, index, (__force u64) cputime);
/* Account for user time used */
- acct_update_integrals(p);
+ acct_account_cputime(p);
}
/*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
task_group_account_field(p, index, (__force u64) cputime);
/* Account for system time used */
- acct_update_integrals(p);
+ acct_account_cputime(p);
}
/*
@@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
+ cputime_t utime, stime;
struct task_struct *t;
times->utime = sig->utime;
@@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
- times->utime += t->utime;
- times->stime += t->stime;
+ task_cputime(tsk, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
} while_each_thread(tsk, t);
out:
rcu_read_unlock();
}
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* Account a tick to a process and cpustat
@@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
irqtime_account_process_tick(current, 0, rq);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (vtime_accounting_enabled())
+ return;
+
if (sched_clock_irqtime) {
irqtime_account_process_tick(p, user_tick, rq);
return;
@@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)
account_idle_time(jiffies_to_cputime(ticks));
}
-
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
* Use precise platform statistics if available:
@@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*st = cputime.stime;
}
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- vtime_account_system(tsk);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
{
+ if (!vtime_accounting_enabled())
+ return;
+
if (is_idle_task(prev))
vtime_account_idle(prev);
else
vtime_account_system(prev);
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
vtime_account_user(prev);
+#endif
arch_vtime_task_switch(prev);
}
#endif
@@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
{
- if (in_interrupt() || !is_idle_task(tsk))
- vtime_account_system(tsk);
- else
- vtime_account_idle(tsk);
+ if (!vtime_accounting_enabled())
+ return;
+
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because the context tracking don't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_account);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
-#endif
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
{
u64 temp = (__force u64) rtime;
- temp *= (__force u64) utime;
+ temp *= (__force u64) stime;
if (sizeof(cputime_t) == 4)
temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, utime, total;
+ cputime_t rtime, stime, total;
- utime = curr->utime;
- total = utime + curr->stime;
+ stime = curr->stime;
+ total = stime + curr->utime;
/*
* Tick based cputime accounting depend on random scheduling
@@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
if (total)
- utime = scale_utime(utime, rtime, total);
+ stime = scale_stime(stime, rtime, total);
else
- utime = rtime;
+ stime = rtime;
/*
* If the tick based count grows faster than the scheduler one,
* the result of the scaling may go backward.
* Let's enforce monotonicity.
*/
- prev->utime = max(prev->utime, utime);
- prev->stime = max(prev->stime, rtime - prev->utime);
+ prev->stime = max(prev->stime, stime);
+ prev->utime = max(prev->utime, rtime - prev->stime);
*ut = prev->utime;
*st = prev->stime;
@@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime = {
- .utime = p->utime,
- .stime = p->stime,
.sum_exec_runtime = p->se.sum_exec_runtime,
};
+ task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
@@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
thread_group_cputime(p, &cputime);
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long clock;
+
+ clock = local_clock();
+ if (clock < tsk->vtime_snap)
+ return 0;
+
+ return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long delta = vtime_delta(tsk);
+
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ tsk->vtime_snap += delta;
+
+ /* CHECKME: always safe to convert nsecs to cputime? */
+ return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_irq_exit(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ if (context_tracking_in_user())
+ tsk->vtime_snap_whence = VTIME_USER;
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+ cputime_t delta_cpu;
+
+ if (!vtime_accounting_enabled())
+ return;
+
+ delta_cpu = get_vtime_delta(tsk);
+
+ write_seqlock(&tsk->vtime_seqlock);
+ tsk->vtime_snap_whence = VTIME_SYS;
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ tsk->vtime_snap_whence = VTIME_USER;
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags |= PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags &= ~PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_idle_time(delta_cpu);
+}
+
+bool vtime_accounting_enabled(void)
+{
+ return context_tracking_active();
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+ write_seqlock(&prev->vtime_seqlock);
+ prev->vtime_snap_whence = VTIME_SLEEPING;
+ write_sequnlock(&prev->vtime_seqlock);
+
+ write_seqlock(&current->vtime_seqlock);
+ current->vtime_snap_whence = VTIME_SYS;
+ current->vtime_snap = sched_clock();
+ write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ t->vtime_snap_whence = VTIME_SYS;
+ t->vtime_snap = sched_clock();
+ write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+ unsigned int seq;
+ cputime_t gtime;
+
+ do {
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ gtime = t->gtime;
+ if (t->flags & PF_VCPU)
+ gtime += vtime_delta(t);
+
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+
+ return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+ cputime_t *u_dst, cputime_t *s_dst,
+ cputime_t *u_src, cputime_t *s_src,
+ cputime_t *udelta, cputime_t *sdelta)
+{
+ unsigned int seq;
+ unsigned long long delta;
+
+ do {
+ *udelta = 0;
+ *sdelta = 0;
+
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ if (u_dst)
+ *u_dst = *u_src;
+ if (s_dst)
+ *s_dst = *s_src;
+
+ /* Task is sleeping, nothing to add */
+ if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(t);
+
+ /*
+ * Task runs either in user or kernel space, add pending nohz time to
+ * the right place.
+ */
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+ *udelta = delta;
+ } else {
+ if (t->vtime_snap_whence == VTIME_SYS)
+ *sdelta = delta;
+ }
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utime, stime, &t->utime,
+ &t->stime, &udelta, &sdelta);
+ if (utime)
+ *utime += udelta;
+ if (stime)
+ *stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled, cputime_t *stimescaled)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utimescaled, stimescaled,
+ &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+ if (utimescaled)
+ *utimescaled += cputime_to_scaled(udelta);
+ if (stimescaled)
+ *stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2cd3c1b4e582..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- /*
- * May be NULL if the underlying cgroup isn't fully-created yet
- */
- if (!tg->css.cgroup) {
- group_path[0] = '\0';
- return group_path;
- }
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
return group_path;
}
@@ -222,8 +215,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->runnable_load_avg);
SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
- atomic64_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
+ (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
{
unsigned int freq = cpu_khz ? : 1;
- SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
#else
- SEQ_printf(m, "\ncpu#%d\n", cpu);
+ SEQ_printf(m, "cpu#%d\n", cpu);
#endif
#define P(x) \
@@ -330,6 +323,7 @@ do { \
print_rq(m, rq, cpu);
rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
+ SEQ_printf(m, "\n");
}
static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
"linear"
};
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
{
u64 ktime, sched_clk, cpu_clk;
unsigned long flags;
- int cpu;
local_irq_save(flags);
ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
#undef PN
#undef P
- SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
sysctl_sched_tunable_scaling,
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
- for_each_online_cpu(cpu)
- print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
- SEQ_printf(m, "\n");
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
return 0;
}
void sysrq_sched_debug_show(void)
{
- sched_debug_show(NULL, NULL);
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+
+ return 0;
}
static int sched_debug_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_debug_show, NULL);
+ int ret = 0;
+
+ ret = seq_open(filp, &sched_debug_sops);
+
+ return ret;
}
static const struct file_operations sched_debug_fops = {
.open = sched_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = sched_debug_release,
};
static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 59e072b2db97..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>
@@ -774,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq;
+
+ if (!p->mm) /* for example, ksmd faulting in a user's mm */
+ return;
+ seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+
+ /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages, bool migrated)
+{
+ struct task_struct *p = current;
+
+ if (!sched_feat_numa(NUMA))
+ return;
+
+ /* FIXME: Allocate task-specific structure for placement policy here */
+
+ /*
+ * If pages are properly placed (did not migrate) then scan slower.
+ * This is reset periodically in case of phase changes
+ */
+ if (!migrated)
+ p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+ p->numa_scan_period + jiffies_to_msecs(10));
+
+ task_numa_placement(p);
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ long pages;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * We do not care about task placement until a task runs on a node
+ * other than the first one used by the address space. This is
+ * largely because migrations are driven by what CPU the task
+ * is running on. If it's never scheduled on another node, it'll
+ * not migrate so why bother trapping the fault.
+ */
+ if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+ mm->first_nid = numa_node_id();
+ if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+ /* Are we running on a new node yet? */
+ if (numa_node_id() == mm->first_nid &&
+ !sched_feat_numa(NUMA_FORCE))
+ return;
+
+ mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+ }
+
+ /*
+ * Reset the scan period if enough time has gone by. Objective is that
+ * scanning will be reduced if pages are properly placed. As tasks
+ * can enter different phases this needs to be re-examined. Lacking
+ * proper tracking of reference behaviour, this blunt hammer is used.
+ */
+ migrate = mm->numa_next_reset;
+ if (time_after(now, migrate)) {
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+ xchg(&mm->numa_next_reset, next_scan);
+ }
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0)
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+ next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ /*
+ * Do not set pte_numa if the current running node is rate-limited.
+ * This loses statistics on the fault but if we are unwilling to
+ * migrate to this node, it is less likely we can do useful work
+ */
+ if (migrate_ratelimited(numa_node_id()))
+ return;
+
+ start = mm->numa_scan_offset;
+ pages = sysctl_numa_balancing_scan_size;
+ pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ if (!pages)
+ return;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ start = 0;
+ vma = mm->mmap;
+ }
+ for (; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma))
+ continue;
+
+ /* Skip small VMAs. They are not likely to be of relevance */
+ if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+ continue;
+
+ do {
+ start = max(start, vma->vm_start);
+ end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+ end = min(end, vma->vm_end);
+ pages -= change_prot_numa(vma, start, end);
+
+ start = end;
+ if (pages <= 0)
+ goto out;
+ } while (end != vma->vm_end);
+ }
+
+out:
+ /*
+ * It is possible to reach the end of the VMA list but the last few VMAs are
+ * not guaranteed to the vma_migratable. If they are not, we would find the
+ * !migratable VMA on the next scan but not reset the scanner to the start
+ * so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = start;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ if (!curr->node_stamp)
+ curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ curr->node_stamp = now;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
+ }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -1265,7 +1492,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
}
__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
- update_cfs_shares(cfs_rq);
}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1454,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
}
/* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
-
- se->vruntime = vruntime;
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -1475,8 +1699,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- account_entity_enqueue(cfs_rq, se);
enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -1549,6 +1774,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -1568,8 +1794,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1583,7 +1809,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- se->on_rq = 0;
+ update_cfs_shares(cfs_rq);
}
/*
@@ -2435,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
-static void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
@@ -2595,8 +2821,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_cfs_shares(cfs_rq);
update_entity_load_avg(se, 1);
- update_cfs_rq_blocked_load(cfs_rq, 0);
}
if (!se) {
@@ -2656,8 +2882,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_cfs_shares(cfs_rq);
update_entity_load_avg(se, 1);
- update_cfs_rq_blocked_load(cfs_rq, 0);
}
if (!se) {
@@ -3026,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
- int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
struct sched_domain *sd;
struct sched_group *sg;
- int i;
+ int i = task_cpu(p);
- /*
- * If the task is going to be woken-up on this cpu and if it is
- * already idle, then it is the right target.
- */
- if (target == cpu && idle_cpu(cpu))
- return cpu;
+ if (idle_cpu(target))
+ return target;
/*
- * If the task is going to be woken-up on the cpu where it previously
- * ran and if it is currently idle, then it the right target.
+ * If the prevous cpu is cache affine and idle, don't be stupid.
*/
- if (target == prev_cpu && idle_cpu(prev_cpu))
- return prev_cpu;
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
@@ -3058,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
goto next;
for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
+ if (i == target || !idle_cpu(i))
goto next;
}
@@ -5500,6 +5719,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ if (sched_feat_numa(NUMA))
+ task_tick_numa(rq, curr);
+
update_rq_runnable_avg(rq, 1);
}
@@ -5837,11 +6059,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
- for_each_sched_entity(se) {
+ for_each_sched_entity(se)
update_cfs_shares(group_cfs_rq(se));
- /* update contribution to parent */
- update_entity_load_avg(se, 1);
- }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5873,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e68e69ab917d..1ad1d2b5395f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy. Enabled automatically
+ * at runtime if running on a NUMA machine. Can be controlled via
+ * numa_balancing=. Allow PTE scanning to be forced on UMA machines
+ * for debugging the core machinery.
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, false)
+SCHED_FEAT(NUMA_FORCE, false)
+#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344e..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
#include <linux/slab.h>
+int sched_rr_timeslice = RR_TIMESLICE;
+
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
@@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
int i, weight, more = 0;
u64 rt_period;
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
return;
delta_exec = rq->clock_task - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->nr_cpus_allowed > 1))
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
return 1;
return 0;
}
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (p->on_rq && !rq->rt.rt_nr_running)
- pull_rt_task(rq);
+ if (!p->on_rq || rq->rt.rt_nr_running)
+ return;
+
+ if (pull_rt_task(rq))
+ resched_task(rq->curr);
}
void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
if (soft != RLIM_INFINITY) {
unsigned long next;
- p->rt.timeout++;
+ if (p->rt.watchdog_stamp != jiffies) {
+ p->rt.timeout++;
+ p->rt.watchdog_stamp = jiffies;
+ }
+
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
if (p->rt.timeout > next)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
if (--p->rt.time_slice)
return;
- p->rt.time_slice = RR_TIMESLICE;
+ p->rt.time_slice = sched_rr_timeslice;
/*
* Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
* Time slice is 0 for SCHED_FIFO tasks
*/
if (task->policy == SCHED_RR)
- return RR_TIMESLICE;
+ return sched_rr_timeslice;
else
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5eca173b563f..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
@@ -663,6 +665,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#ifdef CONFIG_SCHED_DEBUG
+#define numabalancing_enabled sched_feat_numa(NUMA)
+#else
+extern bool numabalancing_enabled;
+#endif /* CONFIG_SCHED_DEBUG */
+#else
+#define sched_feat_numa(x) (0)
+#define numabalancing_enabled (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
if (mask_str == NULL)
return -ENOMEM;
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ if (v == (void *)1) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ } else {
+ struct rq *rq;
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
#endif
+ cpu = (unsigned long)(v - 2);
+ rq = cpu_rq(cpu);
/* runqueue-specific stats */
seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
return 0;
}
-static int schedstat_open(struct inode *inode, struct file *file)
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
+ unsigned long n = *offset;
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &schedstat_sops);
}
+static int schedstat_release(struct inode *inode, struct file *file)
+{
+ return 0;
+};
+
static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = schedstat_release,
};
static int __init proc_schedstat_init(void)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
#ifdef CONFIG_SECCOMP_FILTER
case SECCOMP_MODE_FILTER: {
int data;
+ struct pt_regs *regs = task_pt_regs(current);
ret = seccomp_run_filters(this_syscall);
data = ret & SECCOMP_RET_DATA;
ret &= SECCOMP_RET_ACTION;
switch (ret) {
case SECCOMP_RET_ERRNO:
/* Set the low-order 16-bits as a errno. */
- syscall_set_return_value(current, task_pt_regs(current),
+ syscall_set_return_value(current, regs,
-data, 0);
goto skip;
case SECCOMP_RET_TRAP:
/* Show the handler the original registers. */
- syscall_rollback(current, task_pt_regs(current));
+ syscall_rollback(current, regs);
/* Let the filter pass back 16 bits of data. */
seccomp_send_sigsys(this_syscall, data);
goto skip;
case SECCOMP_RET_TRACE:
/* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current, regs,
+ -ENOSYS, 0);
goto skip;
+ }
/* Allow the BPF to provide the event message */
ptrace_event(PTRACE_EVENT_SECCOMP, data);
/*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
*/
if (fatal_signal_pending(current))
break;
+ if (syscall_get_nr(current, regs) < 0)
+ goto skip; /* Explicit request to skip. */
+
return 0;
case SECCOMP_RET_ALLOW:
return 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index a49c7f36ceb3..2ec870a4c3c4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -31,6 +31,7 @@
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
+#include <linux/compat.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -679,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* No need to set need_resched since signal event passing
* goes through ->blocked
*/
-void signal_wake_up(struct task_struct *t, int resume)
+void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
- unsigned int mask;
-
set_tsk_thread_flag(t, TIF_SIGPENDING);
-
/*
- * For SIGKILL, we want to wake it up in the stopped/traced/killable
+ * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
* executing another processor and just now entering stopped state.
* By using wake_up_state, we ensure the process will wake up and
* handle its death signal.
*/
- mask = TASK_INTERRUPTIBLE;
- if (resume)
- mask |= TASK_WAKEKILL;
- if (!wake_up_state(t, mask))
+ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
kick_process(t);
}
@@ -843,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t)
assert_spin_locked(&t->sighand->siglock);
task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
- signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+ ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}
/*
@@ -1162,11 +1157,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = signal_pt_regs();
- printk("%s/%d: potentially unexpected fatal signal %d.\n",
+ printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
current->comm, task_pid_nr(current), signr);
#if defined(__i386__) && !defined(__arch_um__)
- printk("code at %08lx: ", regs->ip);
+ printk(KERN_INFO "code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
@@ -1174,11 +1169,11 @@ static void print_fatal_signal(int signr)
if (get_user(insn, (unsigned char *)(regs->ip + i)))
break;
- printk("%02x ", insn);
+ printk(KERN_CONT "%02x ", insn);
}
}
+ printk(KERN_CONT "\n");
#endif
- printk("\n");
preempt_disable();
show_regs(regs);
preempt_enable();
@@ -1637,6 +1632,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
unsigned long flags;
struct sighand_struct *psig;
bool autoreap = false;
+ cputime_t utime, stime;
BUG_ON(sig == -1);
@@ -1674,8 +1670,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
task_uid(tsk));
rcu_read_unlock();
- info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
- info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
+ info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
@@ -1739,6 +1736,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
+ cputime_t utime, stime;
if (for_ptracer) {
parent = tsk->parent;
@@ -1753,12 +1751,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
* see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
rcu_read_unlock();
- info.si_utime = cputime_to_clock_t(tsk->utime);
- info.si_stime = cputime_to_clock_t(tsk->stime);
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime);
+ info.si_stime = cputime_to_clock_t(stime);
info.si_code = why;
switch (why) {
@@ -1799,6 +1798,10 @@ static inline int may_ptrace_stop(void)
* If SIGKILL was already sent before the caller unlocked
* ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
+ *
+ * This is almost outdated, a task with the pending SIGKILL can't
+ * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
+ * after SIGKILL was already dequeued.
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
@@ -1924,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
+ /* tasklist protects us from ptrace_freeze_traced() */
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
@@ -2395,6 +2399,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
tracehook_signal_handler(sig, info, ka, regs, stepping);
}
+void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
+{
+ if (failed)
+ force_sigsegv(ksig->sig, current);
+ else
+ signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
+ signal_pt_regs(), stepping);
+}
+
/*
* It could be that complete_signal() picked us to notify about the
* group-wide signal. Other threads should be notified now to take
@@ -2527,11 +2540,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
*/
void set_current_blocked(sigset_t *newset)
{
- struct task_struct *tsk = current;
sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
- spin_lock_irq(&tsk->sighand->siglock);
- __set_task_blocked(tsk, newset);
- spin_unlock_irq(&tsk->sighand->siglock);
+ __set_current_blocked(newset);
}
void __set_current_blocked(const sigset_t *newset)
@@ -2615,28 +2625,58 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
return 0;
}
-long do_sigpending(void __user *set, unsigned long sigsetsize)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
+ compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
- long error = -EINVAL;
- sigset_t pending;
+#ifdef __BIG_ENDIAN
+ sigset_t old_set = current->blocked;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (nset) {
+ compat_sigset_t new32;
+ sigset_t new_set;
+ int error;
+ if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&new_set, &new32);
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ error = sigprocmask(how, &new_set, NULL);
+ if (error)
+ return error;
+ }
+ if (oset) {
+ compat_sigset_t old32;
+ sigset_to_compat(&old32, &old_set);
+ if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ }
+ return 0;
+#else
+ return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
+ (sigset_t __user *)oset, sigsetsize);
+#endif
+}
+#endif
+
+static int do_sigpending(void *set, unsigned long sigsetsize)
+{
if (sigsetsize > sizeof(sigset_t))
- goto out;
+ return -EINVAL;
spin_lock_irq(&current->sighand->siglock);
- sigorsets(&pending, &current->pending.signal,
+ sigorsets(set, &current->pending.signal,
&current->signal->shared_pending.signal);
spin_unlock_irq(&current->sighand->siglock);
/* Outside the lock because only this thread touches it. */
- sigandsets(&pending, &current->blocked, &pending);
-
- error = -EFAULT;
- if (!copy_to_user(set, &pending, sigsetsize))
- error = 0;
-
-out:
- return error;
+ sigandsets(set, &current->blocked, set);
+ return 0;
}
/**
@@ -2645,11 +2685,36 @@ out:
* @set: stores pending signals
* @sigsetsize: size of sigset_t type or larger
*/
-SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
+SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
- return do_sigpending(set, sigsetsize);
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err && copy_to_user(uset, &set, sigsetsize))
+ err = -EFAULT;
+ return err;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
+ compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err) {
+ compat_sigset_t set32;
+ sigset_to_compat(&set32, &set);
+ /* we can get here only if sigsetsize <= sizeof(set) */
+ if (copy_to_user(uset, &set32, sigsetsize))
+ err = -EFAULT;
+ }
+ return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
+#endif
+}
+#endif
+
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
@@ -2926,6 +2991,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
return do_tkill(0, pid, sig);
}
+static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
+{
+ /* Not even root can pretend to send signals from the kernel.
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid)) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
+ return -EPERM;
+ }
+ info->si_signo = sig;
+
+ /* POSIX.1b doesn't mention process groups. */
+ return kill_proc_info(sig, info, pid);
+}
+
/**
* sys_rt_sigqueueinfo - send signal information to a signal
* @pid: the PID of the thread
@@ -2936,25 +3018,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
siginfo_t info;
-
if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
return -EFAULT;
+ return do_rt_sigqueueinfo(pid, sig, &info);
+}
- /* Not even root can pretend to send signals from the kernel.
- * Nor can they impersonate a kill()/tgkill(), which adds source info.
- */
- if (info.si_code >= 0 || info.si_code == SI_TKILL) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info.si_code < 0);
- return -EPERM;
- }
- info.si_signo = sig;
-
- /* POSIX.1b doesn't mention process groups. */
- return kill_proc_info(sig, &info, pid);
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+ int ret = copy_siginfo_from_user32(&info, uinfo);
+ if (unlikely(ret))
+ return ret;
+ return do_rt_sigqueueinfo(pid, sig, &info);
}
+#endif
-long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
@@ -2963,7 +3046,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
- if (info->si_code >= 0 || info->si_code == SI_TKILL) {
+ if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
+ (task_pid_vnr(current) != pid)) {
/* We used to allow any < 0 si_code */
WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
@@ -2984,6 +3068,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
+ compat_pid_t, tgid,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_siginfo_from_user32(&info, uinfo))
+ return -EFAULT;
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+#endif
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *t = current;
@@ -3029,7 +3128,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
return 0;
}
-int
+static int
do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
{
stack_t oss;
@@ -3094,6 +3193,76 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
out:
return error;
}
+SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
+{
+ return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+}
+
+int restore_altstack(const stack_t __user *uss)
+{
+ int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+ /* squash all but EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __save_altstack(stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sigaltstack,
+ const compat_stack_t __user *, uss_ptr,
+ compat_stack_t __user *, uoss_ptr)
+{
+ stack_t uss, uoss;
+ int ret;
+ mm_segment_t seg;
+
+ if (uss_ptr) {
+ compat_stack_t uss32;
+
+ memset(&uss, 0, sizeof(stack_t));
+ if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
+ return -EFAULT;
+ uss.ss_sp = compat_ptr(uss32.ss_sp);
+ uss.ss_flags = uss32.ss_flags;
+ uss.ss_size = uss32.ss_size;
+ }
+ seg = get_fs();
+ set_fs(KERNEL_DS);
+ ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
+ (stack_t __force __user *) &uoss,
+ compat_user_stack_pointer());
+ set_fs(seg);
+ if (ret >= 0 && uoss_ptr) {
+ if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
+ __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
+ __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
+ __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+int compat_restore_altstack(const compat_stack_t __user *uss)
+{
+ int err = compat_sys_sigaltstack(uss, NULL);
+ /* squash all but -EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+#endif
#ifdef __ARCH_WANT_SYS_SIGPENDING
@@ -3103,7 +3272,7 @@ out:
*/
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
{
- return do_sigpending(set, sizeof(*set));
+ return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
}
#endif
@@ -3130,7 +3299,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
if (nset) {
if (copy_from_user(&new_set, nset, sizeof(*nset)))
return -EFAULT;
- new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
new_blocked = current->blocked;
@@ -3148,7 +3316,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
return -EINVAL;
}
- __set_current_blocked(&new_blocked);
+ set_current_blocked(&new_blocked);
}
if (oset) {
@@ -3160,7 +3328,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
-#ifdef __ARCH_WANT_SYS_RT_SIGACTION
+#ifndef CONFIG_ODD_RT_SIGACTION
/**
* sys_rt_sigaction - alter an action taken by a process
* @sig: signal to be sent
@@ -3194,7 +3362,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
out:
return ret;
}
-#endif /* __ARCH_WANT_SYS_RT_SIGACTION */
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
+ const struct compat_sigaction __user *, act,
+ struct compat_sigaction __user *, oact,
+ compat_size_t, sigsetsize)
+{
+ struct k_sigaction new_ka, old_ka;
+ compat_sigset_t mask;
+#ifdef __ARCH_HAS_SA_RESTORER
+ compat_uptr_t restorer;
+#endif
+ int ret;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+
+ if (act) {
+ compat_uptr_t handler;
+ ret = get_user(handler, &act->sa_handler);
+ new_ka.sa.sa_handler = compat_ptr(handler);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= get_user(restorer, &act->sa_restorer);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+#endif
+ ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+ ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+ if (ret)
+ return -EFAULT;
+ sigset_from_compat(&new_ka.sa.sa_mask, &mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+ if (!ret && oact) {
+ sigset_to_compat(&mask, &old_ka.sa.sa_mask);
+ ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler);
+ ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+ ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer);
+#endif
+ }
+ return ret;
+}
+#endif
+#endif /* !CONFIG_ODD_RT_SIGACTION */
+
+#ifdef CONFIG_OLD_SIGACTION
+SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct old_sigaction __user *, act,
+ struct old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ old_sigset_t mask;
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+#ifdef CONFIG_COMPAT_OLD_SIGACTION
+COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct compat_old_sigaction __user *, act,
+ struct compat_old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+ compat_old_sigset_t mask;
+ compat_uptr_t handler, restorer;
+
+ if (act) {
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(handler, &act->sa_handler) ||
+ __get_user(restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ new_ka.sa.sa_handler = compat_ptr(handler);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+ return ret;
+}
+#endif
#ifdef __ARCH_WANT_SYS_SGETMASK
@@ -3212,6 +3505,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
int old = current->blocked.sig[0];
sigset_t newset;
+ siginitset(&newset, newmask);
set_current_blocked(&newset);
return old;
@@ -3261,7 +3555,6 @@ int sigsuspend(sigset_t *set)
return -ERESTARTNOHAND;
}
-#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
/**
* sys_rt_sigsuspend - replace the signal mask for a value with the
* @unewset value until a signal is received
@@ -3280,7 +3573,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
return -EFAULT;
return sigsuspend(&newset);
}
-#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t newset;
+ compat_sigset_t newset32;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&newset, &newset32);
+ return sigsuspend(&newset);
+#else
+ /* on little-endian bitmaps don't care about granularity */
+ return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
+#endif
+}
+#endif
+
+#ifdef CONFIG_OLD_SIGSUSPEND
+SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
+#ifdef CONFIG_OLD_SIGSUSPEND3
+SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
{
diff --git a/kernel/smp.c b/kernel/smp.c
index 29dd40a9f2f4..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,23 +16,14 @@
#include "smpboot.h"
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static struct {
- struct list_head queue;
- raw_spinlock_t lock;
-} call_function __cacheline_aligned_in_smp =
- {
- .queue = LIST_HEAD_INIT(call_function.queue),
- .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
- };
-
enum {
CSD_FLAG_LOCK = 0x01,
};
struct call_function_data {
- struct call_single_data csd;
- atomic_t refs;
+ struct call_single_data __percpu *csd;
cpumask_var_t cpumask;
+ cpumask_var_t cpumask_ipi;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -56,6 +47,14 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return notifier_from_errno(-ENOMEM);
+ if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return notifier_from_errno(-ENOMEM);
+ cfd->csd = alloc_percpu(struct call_single_data);
+ if (!cfd->csd) {
+ free_cpumask_var(cfd->cpumask);
+ return notifier_from_errno(-ENOMEM);
+ }
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -65,6 +64,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
+ free_percpu(cfd->csd);
break;
#endif
};
@@ -166,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
}
/*
- * Invoked by arch to handle an IPI for call function. Must be called with
- * interrupts disabled.
- */
-void generic_smp_call_function_interrupt(void)
-{
- struct call_function_data *data;
- int cpu = smp_processor_id();
-
- /*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
- WARN_ON_ONCE(!cpu_online(cpu));
-
- /*
- * Ensure entry is visible on call_function_queue after we have
- * entered the IPI. See comment in smp_call_function_many.
- * If we don't have this, then we may miss an entry on the list
- * and never get another IPI to process it.
- */
- smp_mb();
-
- /*
- * It's ok to use list_for_each_rcu() here even though we may
- * delete 'pos', since list_del_rcu() doesn't clear ->next
- */
- list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
- int refs;
- smp_call_func_t func;
-
- /*
- * Since we walk the list without any locks, we might
- * see an entry that was completed, removed from the
- * list and is in the process of being reused.
- *
- * We must check that the cpu is in the cpumask before
- * checking the refs, and both must be set before
- * executing the callback on this cpu.
- */
-
- if (!cpumask_test_cpu(cpu, data->cpumask))
- continue;
-
- smp_rmb();
-
- if (atomic_read(&data->refs) == 0)
- continue;
-
- func = data->csd.func; /* save for later warn */
- func(data->csd.info);
-
- /*
- * If the cpu mask is not still set then func enabled
- * interrupts (BUG), and this cpu took another smp call
- * function interrupt and executed func(info) twice
- * on this cpu. That nested execution decremented refs.
- */
- if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
- WARN(1, "%pf enabled interrupts and double executed\n", func);
- continue;
- }
-
- refs = atomic_dec_return(&data->refs);
- WARN_ON(refs < 0);
-
- if (refs)
- continue;
-
- WARN_ON(!cpumask_empty(data->cpumask));
-
- raw_spin_lock(&call_function.lock);
- list_del_rcu(&data->csd.list);
- raw_spin_unlock(&call_function.lock);
-
- csd_unlock(&data->csd);
- }
-
-}
-
-/*
* Invoked by arch to handle an IPI for call function single. Must be
* called from the arch with interrupts disabled.
*/
@@ -448,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
struct call_function_data *data;
- unsigned long flags;
- int refs, cpu, next_cpu, this_cpu = smp_processor_id();
+ int cpu, next_cpu, this_cpu = smp_processor_id();
/*
* Can deadlock when called with interrupts disabled.
@@ -481,79 +402,46 @@ void smp_call_function_many(const struct cpumask *mask,
}
data = &__get_cpu_var(cfd_data);
- csd_lock(&data->csd);
-
- /* This BUG_ON verifies our reuse assertions and can be removed */
- BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
- /*
- * The global call function queue list add and delete are protected
- * by a lock, but the list is traversed without any lock, relying
- * on the rcu list add and delete to allow safe concurrent traversal.
- * We reuse the call function data without waiting for any grace
- * period after some other cpu removes it from the global queue.
- * This means a cpu might find our data block as it is being
- * filled out.
- *
- * We hold off the interrupt handler on the other cpu by
- * ordering our writes to the cpu mask vs our setting of the
- * refs counter. We assert only the cpu owning the data block
- * will set a bit in cpumask, and each bit will only be cleared
- * by the subject cpu. Each cpu must first find its bit is
- * set and then check that refs is set indicating the element is
- * ready to be processed, otherwise it must skip the entry.
- *
- * On the previous iteration refs was set to 0 by another cpu.
- * To avoid the use of transitivity, set the counter to 0 here
- * so the wmb will pair with the rmb in the interrupt handler.
- */
- atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
-
- data->csd.func = func;
- data->csd.info = info;
-
- /* Ensure 0 refs is visible before mask. Also orders func and info */
- smp_wmb();
-
- /* We rely on the "and" being processed before the store */
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
- refs = cpumask_weight(data->cpumask);
/* Some callers race with other cpus changing the passed mask */
- if (unlikely(!refs)) {
- csd_unlock(&data->csd);
+ if (unlikely(!cpumask_weight(data->cpumask)))
return;
- }
- raw_spin_lock_irqsave(&call_function.lock, flags);
/*
- * Place entry at the _HEAD_ of the list, so that any cpu still
- * observing the entry in generic_smp_call_function_interrupt()
- * will not miss any other list entries:
+ * After we put an entry into the list, data->cpumask
+ * may be cleared again when another CPU sends another IPI for
+ * a SMP function call, so data->cpumask will be zero.
*/
- list_add_rcu(&data->csd.list, &call_function.queue);
- /*
- * We rely on the wmb() in list_add_rcu to complete our writes
- * to the cpumask before this write to refs, which indicates
- * data is on the list and is ready to be processed.
- */
- atomic_set(&data->refs, refs);
- raw_spin_unlock_irqrestore(&call_function.lock, flags);
+ cpumask_copy(data->cpumask_ipi, data->cpumask);
- /*
- * Make the list addition visible before sending the ipi.
- * (IPIs must obey or appear to obey normal Linux cache
- * coherency rules -- see comment in generic_exec_single).
- */
- smp_mb();
+ for_each_cpu(cpu, data->cpumask) {
+ struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
+ struct call_single_queue *dst =
+ &per_cpu(call_single_queue, cpu);
+ unsigned long flags;
+
+ csd_lock(csd);
+ csd->func = func;
+ csd->info = info;
+
+ raw_spin_lock_irqsave(&dst->lock, flags);
+ list_add_tail(&csd->list, &dst->list);
+ raw_spin_unlock_irqrestore(&dst->lock, flags);
+ }
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(data->cpumask);
+ arch_send_call_function_ipi_mask(data->cpumask_ipi);
- /* Optionally wait for the CPUs to complete */
- if (wait)
- csd_lock_wait(&data->csd);
+ if (wait) {
+ for_each_cpu(cpu, data->cpumask) {
+ struct call_single_data *csd =
+ per_cpu_ptr(data->csd, cpu);
+ csd_lock_wait(csd);
+ }
+ }
}
EXPORT_SYMBOL(smp_call_function_many);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc054242..b9bde5727829 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data)
continue;
}
- BUG_ON(td->cpu != smp_processor_id());
+ //BUG_ON(td->cpu != smp_processor_id());
/* Check for state change setup */
switch (td->status) {
@@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
kfree(td);
return PTR_ERR(tsk);
}
-
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
+ if (ht->create)
+ ht->create(cpu);
return 0;
}
@@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (tsk)
+ if (tsk && !ht->selfparking)
kthread_park(tsk);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567babe789..b4d252fd195b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
EXPORT_SYMBOL(local_bh_enable_ip);
/*
- * We restart softirq processing MAX_SOFTIRQ_RESTART times,
- * and we fall back to softirqd after that.
+ * We restart softirq processing for at most 2 ms,
+ * and if need_resched() is not set.
*
- * This number has been established via experimentation.
+ * These limits have been established via experimentation.
* The two things to balance is latency against fairness -
* we want to handle softirqs as soon as possible, but they
* should not be able to lock up the box.
*/
-#define MAX_SOFTIRQ_RESTART 10
+#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
__u32 pending;
- int max_restart = MAX_SOFTIRQ_RESTART;
+ unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
int cpu;
unsigned long old_flags = current->flags;
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
- vtime_account_irq_enter(current);
+ account_irq_enter_time(current);
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
@@ -264,15 +264,16 @@ restart:
local_irq_disable();
pending = local_softirq_pending();
- if (pending && --max_restart)
- goto restart;
+ if (pending) {
+ if (time_before(jiffies, end) && !need_resched())
+ goto restart;
- if (pending)
wakeup_softirqd();
+ }
lockdep_softirq_exit();
- vtime_account_irq_exit(current);
+ account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
@@ -341,7 +342,7 @@ static inline void invoke_softirq(void)
*/
void irq_exit(void)
{
- vtime_account_irq_exit(current);
+ account_irq_exit_time(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
*/
void cleanup_srcu_struct(struct srcu_struct *sp)
{
- int sum;
-
- sum = srcu_readers_active(sp);
- WARN_ON(sum); /* Leakage unless caller handles error. */
- if (sum != 0)
- return;
+ if (WARN_ON(srcu_readers_active(sp)))
+ return; /* Leakage unless caller handles error. */
free_percpu(sp->per_cpu_ref);
sp->per_cpu_ref = NULL;
}
@@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
{
int idx;
+ idx = ACCESS_ONCE(sp->completed) & 0x1;
preempt_disable();
- idx = rcu_dereference_index_check(sp->completed,
- rcu_read_lock_sched_held()) & 0x1;
ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
smp_mb(); /* B */ /* Avoid leaking the critical section. */
ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
*/
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
- preempt_disable();
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
- preempt_enable();
+ this_cpu_dec(sp->per_cpu_ref->c[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
!lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+ might_sleep();
init_completion(&rcu.completion);
head->next = NULL;
@@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
* synchronize_srcu - wait for prior SRCU read-side critical-section completion
* @sp: srcu_struct with which to synchronize.
*
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates. Can block; must be called from
- * process context.
+ * Wait for the count to drain to zero of both indexes. To avoid the
+ * possible starvation of synchronize_srcu(), it waits for the count of
+ * the index=((->completed & 1) ^ 1) to drain to zero at first,
+ * and then flip the completed and wait for the count of the other index.
+ *
+ * Can block; must be called from process context.
*
* Note that it is illegal to call synchronize_srcu() from the corresponding
* SRCU read-side critical section; doing so will result in deadlock.
@@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
* Wait for an SRCU grace period to elapse, but be more aggressive about
* spinning rather than blocking when waiting.
*
- * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. It is also illegal to call
- * synchronize_srcu_expedited() from the corresponding SRCU read-side
- * critical section; doing so will result in deadlock. However, it is
- * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
- * from some other srcu_struct's read-side critical section, as long as
+ * Note that it is also illegal to call synchronize_srcu_expedited()
+ * from the corresponding SRCU read-side critical section;
+ * doing so will result in deadlock. However, it is perfectly legal
+ * to call synchronize_srcu_expedited() on one srcu_struct from some
+ * other srcu_struct's read-side critical section, as long as
* the resulting graph of srcu_structs is acyclic.
*/
void synchronize_srcu_expedited(struct srcu_struct *sp)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e965715..95d178c62d5a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
#include <linux/stop_machine.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
-
+#include <linux/smpboot.h>
#include <linux/atomic.h>
/*
@@ -37,10 +37,10 @@ struct cpu_stopper {
spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
- struct task_struct *thread; /* stopper thread */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
}
/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work)
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
+
unsigned long flags;
spin_lock_irqsave(&stopper->lock, flags);
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
- wake_up_process(stopper->thread);
+ wake_up_process(p);
} else
cpu_stop_signal_done(work->done, false);
@@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
cpu_stop_init_done(&done, 1);
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+ cpu_stop_queue_work(cpu, &work);
wait_for_completion(&done.completion);
return done.executed ? done.ret : -ENOENT;
}
@@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+ cpu_stop_queue_work(cpu, work_buf);
}
/* static data for stop_cpus */
@@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
*/
preempt_disable();
for_each_cpu(cpu, cpumask)
- cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
- &per_cpu(stop_cpus_work, cpu));
+ cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
preempt_enable();
}
@@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}
-static int cpu_stopper_thread(void *data)
+static int cpu_stop_should_run(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ unsigned long flags;
+ int run;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+ run = !list_empty(&stopper->works);
+ spin_unlock_irqrestore(&stopper->lock, flags);
+ return run;
+}
+
+static void cpu_stopper_thread(unsigned int cpu)
{
- struct cpu_stopper *stopper = data;
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct cpu_stop_work *work;
int ret;
repeat:
- set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
work = NULL;
spin_lock_irq(&stopper->lock);
if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@ repeat:
struct cpu_stop_done *done = work->done;
char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
- __set_current_state(TASK_RUNNING);
-
/* cpu stop callbacks are not allowed to sleep */
preempt_disable();
@@ -290,88 +294,55 @@ repeat:
ksym_buf), arg);
cpu_stop_signal_done(done, true);
- } else
- schedule();
-
- goto repeat;
+ goto repeat;
+ }
}
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
-static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static void cpu_stop_create(unsigned int cpu)
+{
+ sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
+}
+
+static void cpu_stop_park(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct task_struct *p;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- BUG_ON(stopper->thread || stopper->enabled ||
- !list_empty(&stopper->works));
- p = kthread_create_on_node(cpu_stopper_thread,
- stopper,
- cpu_to_node(cpu),
- "migration/%d", cpu);
- if (IS_ERR(p))
- return notifier_from_errno(PTR_ERR(p));
- get_task_struct(p);
- kthread_bind(p, cpu);
- sched_set_stop_task(cpu, p);
- stopper->thread = p;
- break;
-
- case CPU_ONLINE:
- /* strictly unnecessary, as first user will wake it */
- wake_up_process(stopper->thread);
- /* mark enabled */
- spin_lock_irq(&stopper->lock);
- stopper->enabled = true;
- spin_unlock_irq(&stopper->lock);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_POST_DEAD:
- {
- struct cpu_stop_work *work;
-
- sched_set_stop_task(cpu, NULL);
- /* kill the stopper */
- kthread_stop(stopper->thread);
- /* drain remaining works */
- spin_lock_irq(&stopper->lock);
- list_for_each_entry(work, &stopper->works, list)
- cpu_stop_signal_done(work->done, false);
- stopper->enabled = false;
- spin_unlock_irq(&stopper->lock);
- /* release the stopper */
- put_task_struct(stopper->thread);
- stopper->thread = NULL;
- break;
- }
-#endif
- }
+ struct cpu_stop_work *work;
+ unsigned long flags;
- return NOTIFY_OK;
+ /* drain remaining works */
+ spin_lock_irqsave(&stopper->lock, flags);
+ list_for_each_entry(work, &stopper->works, list)
+ cpu_stop_signal_done(work->done, false);
+ stopper->enabled = false;
+ spin_unlock_irqrestore(&stopper->lock, flags);
}
-/*
- * Give it a higher priority so that cpu stopper is available to other
- * cpu notifiers. It currently shares the same priority as sched
- * migration_notifier.
- */
-static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
- .notifier_call = cpu_stop_cpu_callback,
- .priority = 10,
+static void cpu_stop_unpark(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+ spin_lock_irq(&stopper->lock);
+ stopper->enabled = true;
+ spin_unlock_irq(&stopper->lock);
+}
+
+static struct smp_hotplug_thread cpu_stop_threads = {
+ .store = &cpu_stopper_task,
+ .thread_should_run = cpu_stop_should_run,
+ .thread_fn = cpu_stopper_thread,
+ .thread_comm = "migration/%u",
+ .create = cpu_stop_create,
+ .setup = cpu_stop_unpark,
+ .park = cpu_stop_park,
+ .unpark = cpu_stop_unpark,
+ .selfparking = true,
};
static int __init cpu_stop_init(void)
{
- void *bcpu = (void *)(long)smp_processor_id();
unsigned int cpu;
- int err;
for_each_possible_cpu(cpu) {
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)
INIT_LIST_HEAD(&stopper->works);
}
- /* start one for the boot cpu */
- err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
- bcpu);
- BUG_ON(err != NOTIFY_OK);
- cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
- register_cpu_notifier(&cpu_stop_cpu_notifier);
-
+ BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
stop_machine_initialized = true;
-
return 0;
}
early_initcall(cpu_stop_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b37690421..81f56445fba9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/binfmts.h>
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
@@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex);
SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
char buffer[256];
int ret = 0;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT))
+ if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
return -EPERM;
/* For safety, we require "magic" arguments. */
@@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
* pid_namespace, the command is handled by reboot_pid_ns() which will
* call do_exit().
*/
- ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
+ ret = reboot_pid_ns(pid_ns, cmd);
if (ret)
return ret;
@@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
- struct dentry *dentry;
+ struct inode *inode;
int err;
exe = fdget(fd);
if (!exe.file)
return -EBADF;
- dentry = exe.file->f_path.dentry;
+ inode = file_inode(exe.file);
/*
* Because the original mm->exe_file points to executable file, make
@@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(dentry->d_inode->i_mode) ||
+ if (!S_ISREG(inode->i_mode) ||
exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
- err = inode_permission(dentry->d_inode, MAY_EXEC);
+ err = inode_permission(inode, MAY_EXEC);
if (err)
goto exit;
@@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = 0;
switch (option) {
- case PR_SET_PDEATHSIG:
- if (!valid_signal(arg2)) {
- error = -EINVAL;
- break;
- }
- me->pdeath_signal = arg2;
- break;
- case PR_GET_PDEATHSIG:
- error = put_user(me->pdeath_signal, (int __user *)arg2);
- break;
- case PR_GET_DUMPABLE:
- error = get_dumpable(me->mm);
+ case PR_SET_PDEATHSIG:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
break;
- case PR_SET_DUMPABLE:
- if (arg2 < 0 || arg2 > 1) {
- error = -EINVAL;
- break;
- }
- set_dumpable(me->mm, arg2);
+ }
+ me->pdeath_signal = arg2;
+ break;
+ case PR_GET_PDEATHSIG:
+ error = put_user(me->pdeath_signal, (int __user *)arg2);
+ break;
+ case PR_GET_DUMPABLE:
+ error = get_dumpable(me->mm);
+ break;
+ case PR_SET_DUMPABLE:
+ if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
+ error = -EINVAL;
break;
+ }
+ set_dumpable(me->mm, arg2);
+ break;
- case PR_SET_UNALIGN:
- error = SET_UNALIGN_CTL(me, arg2);
- break;
- case PR_GET_UNALIGN:
- error = GET_UNALIGN_CTL(me, arg2);
- break;
- case PR_SET_FPEMU:
- error = SET_FPEMU_CTL(me, arg2);
- break;
- case PR_GET_FPEMU:
- error = GET_FPEMU_CTL(me, arg2);
- break;
- case PR_SET_FPEXC:
- error = SET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_FPEXC:
- error = GET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_TIMING:
- error = PR_TIMING_STATISTICAL;
- break;
- case PR_SET_TIMING:
- if (arg2 != PR_TIMING_STATISTICAL)
- error = -EINVAL;
- break;
- case PR_SET_NAME:
- comm[sizeof(me->comm)-1] = 0;
- if (strncpy_from_user(comm, (char __user *)arg2,
- sizeof(me->comm) - 1) < 0)
- return -EFAULT;
- set_task_comm(me, comm);
- proc_comm_connector(me);
- break;
- case PR_GET_NAME:
- get_task_comm(comm, me);
- if (copy_to_user((char __user *)arg2, comm,
- sizeof(comm)))
- return -EFAULT;
- break;
- case PR_GET_ENDIAN:
- error = GET_ENDIAN(me, arg2);
- break;
- case PR_SET_ENDIAN:
- error = SET_ENDIAN(me, arg2);
- break;
- case PR_GET_SECCOMP:
- error = prctl_get_seccomp();
- break;
- case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2, (char __user *)arg3);
- break;
- case PR_GET_TSC:
- error = GET_TSC_CTL(arg2);
- break;
- case PR_SET_TSC:
- error = SET_TSC_CTL(arg2);
- break;
- case PR_TASK_PERF_EVENTS_DISABLE:
- error = perf_event_task_disable();
- break;
- case PR_TASK_PERF_EVENTS_ENABLE:
- error = perf_event_task_enable();
- break;
- case PR_GET_TIMERSLACK:
- error = current->timer_slack_ns;
- break;
- case PR_SET_TIMERSLACK:
- if (arg2 <= 0)
- current->timer_slack_ns =
+ case PR_SET_UNALIGN:
+ error = SET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_GET_UNALIGN:
+ error = GET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_SET_FPEMU:
+ error = SET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_GET_FPEMU:
+ error = GET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_SET_FPEXC:
+ error = SET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_FPEXC:
+ error = GET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_TIMING:
+ error = PR_TIMING_STATISTICAL;
+ break;
+ case PR_SET_TIMING:
+ if (arg2 != PR_TIMING_STATISTICAL)
+ error = -EINVAL;
+ break;
+ case PR_SET_NAME:
+ comm[sizeof(me->comm) - 1] = 0;
+ if (strncpy_from_user(comm, (char __user *)arg2,
+ sizeof(me->comm) - 1) < 0)
+ return -EFAULT;
+ set_task_comm(me, comm);
+ proc_comm_connector(me);
+ break;
+ case PR_GET_NAME:
+ get_task_comm(comm, me);
+ if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
+ return -EFAULT;
+ break;
+ case PR_GET_ENDIAN:
+ error = GET_ENDIAN(me, arg2);
+ break;
+ case PR_SET_ENDIAN:
+ error = SET_ENDIAN(me, arg2);
+ break;
+ case PR_GET_SECCOMP:
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+ break;
+ case PR_SET_TSC:
+ error = SET_TSC_CTL(arg2);
+ break;
+ case PR_TASK_PERF_EVENTS_DISABLE:
+ error = perf_event_task_disable();
+ break;
+ case PR_TASK_PERF_EVENTS_ENABLE:
+ error = perf_event_task_enable();
+ break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
current->default_timer_slack_ns;
- else
- current->timer_slack_ns = arg2;
- break;
- case PR_MCE_KILL:
- if (arg4 | arg5)
- return -EINVAL;
- switch (arg2) {
- case PR_MCE_KILL_CLEAR:
- if (arg3 != 0)
- return -EINVAL;
- current->flags &= ~PF_MCE_PROCESS;
- break;
- case PR_MCE_KILL_SET:
- current->flags |= PF_MCE_PROCESS;
- if (arg3 == PR_MCE_KILL_EARLY)
- current->flags |= PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_LATE)
- current->flags &= ~PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_DEFAULT)
- current->flags &=
- ~(PF_MCE_EARLY|PF_MCE_PROCESS);
- else
- return -EINVAL;
- break;
- default:
+ else
+ current->timer_slack_ns = arg2;
+ break;
+ case PR_MCE_KILL:
+ if (arg4 | arg5)
+ return -EINVAL;
+ switch (arg2) {
+ case PR_MCE_KILL_CLEAR:
+ if (arg3 != 0)
return -EINVAL;
- }
+ current->flags &= ~PF_MCE_PROCESS;
break;
- case PR_MCE_KILL_GET:
- if (arg2 | arg3 | arg4 | arg5)
- return -EINVAL;
- if (current->flags & PF_MCE_PROCESS)
- error = (current->flags & PF_MCE_EARLY) ?
- PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ case PR_MCE_KILL_SET:
+ current->flags |= PF_MCE_PROCESS;
+ if (arg3 == PR_MCE_KILL_EARLY)
+ current->flags |= PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_LATE)
+ current->flags &= ~PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_DEFAULT)
+ current->flags &=
+ ~(PF_MCE_EARLY|PF_MCE_PROCESS);
else
- error = PR_MCE_KILL_DEFAULT;
- break;
- case PR_SET_MM:
- error = prctl_set_mm(arg2, arg3, arg4, arg5);
- break;
- case PR_GET_TID_ADDRESS:
- error = prctl_get_tid_address(me, (int __user **)arg2);
- break;
- case PR_SET_CHILD_SUBREAPER:
- me->signal->is_child_subreaper = !!arg2;
- break;
- case PR_GET_CHILD_SUBREAPER:
- error = put_user(me->signal->is_child_subreaper,
- (int __user *) arg2);
- break;
- case PR_SET_NO_NEW_PRIVS:
- if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;
-
- current->no_new_privs = 1;
break;
- case PR_GET_NO_NEW_PRIVS:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- return current->no_new_privs ? 1 : 0;
default:
- error = -EINVAL;
- break;
+ return -EINVAL;
+ }
+ break;
+ case PR_MCE_KILL_GET:
+ if (arg2 | arg3 | arg4 | arg5)
+ return -EINVAL;
+ if (current->flags & PF_MCE_PROCESS)
+ error = (current->flags & PF_MCE_EARLY) ?
+ PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ else
+ error = PR_MCE_KILL_DEFAULT;
+ break;
+ case PR_SET_MM:
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_TID_ADDRESS:
+ error = prctl_get_tid_address(me, (int __user **)arg2);
+ break;
+ case PR_SET_CHILD_SUBREAPER:
+ me->signal->is_child_subreaper = !!arg2;
+ break;
+ case PR_GET_CHILD_SUBREAPER:
+ error = put_user(me->signal->is_child_subreaper,
+ (int __user *)arg2);
+ break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ current->no_new_privs = 1;
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return current->no_new_privs ? 1 : 0;
+ default:
+ error = -EINVAL;
+ break;
}
return error;
}
@@ -2184,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-static void argv_cleanup(struct subprocess_info *info)
-{
- argv_free(info->argv);
-}
-
static int __orderly_poweroff(void)
{
int argc;
@@ -2208,9 +2204,8 @@ static int __orderly_poweroff(void)
}
ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
- NULL, argv_cleanup, NULL);
- if (ret == -ENOMEM)
- argv_free(argv);
+ NULL, NULL, NULL);
+ argv_free(argv);
return ret;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dbff751e4086..395084d4ce16 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
cond_syscall(sys_init_module);
+cond_syscall(sys_finit_module);
cond_syscall(sys_delete_module);
cond_syscall(sys_socketpair);
cond_syscall(sys_bind);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 33f71f37267e..afc1dc60f3f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
+#include <linux/sched/sysctl.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -104,7 +105,6 @@ extern char core_pattern[];
extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
-extern int min_free_kbytes;
extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
@@ -157,14 +157,20 @@ extern int sysctl_tsb_ratio;
#ifdef __hppa__
extern int pwrsw_enabled;
+#endif
+
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
extern int unaligned_enabled;
#endif
#ifdef CONFIG_IA64
-extern int no_unaligned_warning;
extern int unaligned_dump_stack;
#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
+extern int no_unaligned_warning;
+#endif
+
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -256,9 +262,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
@@ -301,6 +309,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
+#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
@@ -347,7 +356,45 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_scan_delay_ms",
+ .data = &sysctl_numa_balancing_scan_delay,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_min_ms",
+ .data = &sysctl_numa_balancing_scan_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_reset",
+ .data = &sysctl_numa_balancing_scan_period_reset,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_max_ms",
+ .data = &sysctl_numa_balancing_scan_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_size_mb",
+ .data = &sysctl_numa_balancing_scan_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
@@ -362,6 +409,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rt_handler,
},
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -504,6 +558,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
{
.procname = "unaligned-trap",
.data = &unaligned_enabled,
@@ -870,7 +926,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
#endif
-#ifdef CONFIG_IA64
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
{
.procname = "ignore-unaligned-usertrap",
.data = &no_unaligned_warning,
@@ -878,6 +934,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#ifdef CONFIG_IA64
{
.procname = "unaligned-dump-stack",
.data = &unaligned_dump_stack,
@@ -1965,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write,
int i;
for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
if ((tmptaint >> i) & 1)
- add_taint(i);
+ add_taint(i, LOCKDEP_STILL_OK);
}
}
@@ -2042,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
static void validate_coredump_safety(void)
{
#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMPABLE_SAFE &&
+ if (suid_dumpable == SUID_DUMP_ROOT &&
core_pattern[0] != '/' && core_pattern[0] != '|') {
printk(KERN_WARNING "Unsafe core_pattern used with "\
"suid_dumpable=2. Pipe handler or fully qualified "\
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4e..ebf72358e86a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
{ CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
{ CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
- { CTL_INT, NET_TCP_ABC, "tcp_abc" },
{ CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
{ CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
{ CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
static ssize_t bin_intvec(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t copied = 0;
char *buffer;
ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
if (oldval && oldlen) {
unsigned __user *vec = oldval;
size_t length = oldlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buffer, BUFSZ - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
if (result < 0)
goto out_kfree;
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
if (newval && newlen) {
unsigned __user *vec = newval;
size_t length = newlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file,
str += snprintf(str, end - str, "%lu\t", value);
}
- set_fs(KERNEL_DS);
- result = vfs_write(file, buffer, str - buffer, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buffer, str - buffer, 0);
if (result < 0)
goto out_kfree;
}
@@ -1049,7 +1041,6 @@ out:
static ssize_t bin_ulongvec(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t copied = 0;
char *buffer;
ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
if (oldval && oldlen) {
unsigned long __user *vec = oldval;
size_t length = oldlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buffer, BUFSZ - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
if (result < 0)
goto out_kfree;
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
if (newval && newlen) {
unsigned long __user *vec = newval;
size_t length = newlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file,
str += snprintf(str, end - str, "%lu\t", value);
}
- set_fs(KERNEL_DS);
- result = vfs_write(file, buffer, str - buffer, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buffer, str - buffer, 0);
if (result < 0)
goto out_kfree;
}
@@ -1127,19 +1112,15 @@ out:
static ssize_t bin_uuid(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t result, copied = 0;
/* Only supports reads */
if (oldval && oldlen) {
- loff_t pos = 0;
char buf[40], *str = buf;
unsigned char uuid[16];
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
goto out;
@@ -1175,18 +1156,14 @@ out:
static ssize_t bin_dn_node_address(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t result, copied = 0;
if (oldval && oldlen) {
- loff_t pos = 0;
char buf[15], *nodep;
unsigned long area, node;
__le16 dnaddr;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
goto out;
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
/* Convert the decnet address to binary */
result = -EIO;
- nodep = strchr(buf, '.') + 1;
+ nodep = strchr(buf, '.');
if (!nodep)
goto out;
+ ++nodep;
area = simple_strtoul(buf, NULL, 10);
node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
}
if (newval && newlen) {
- loff_t pos = 0;
__le16 dnaddr;
char buf[15];
int len;
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
- set_fs(KERNEL_DS);
- result = vfs_write(file, buf, len, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buf, len, 0);
if (result < 0)
goto out;
}
@@ -1344,7 +1319,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
goto out_putname;
}
- mnt = current->nsproxy->pid_ns->proc_mnt;
+ mnt = task_active_pid_ns(current)->proc_mnt;
file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
result = PTR_ERR(file);
if (IS_ERR(file))
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd28..f8342a41efa6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
}
/*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+
+/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time.
*
@@ -135,6 +141,8 @@ static inline void warp_clock(void)
struct timespec adjust;
adjust = current_kernel_time();
+ if (sys_tz.tz_minuteswest != 0)
+ persistent_clock_is_local = 1;
adjust.tv_sec += sys_tz.tz_minuteswest * 60;
do_settimeofday(&adjust);
}
@@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);
* Avoid unnecessary multiplications/divisions in the
* two most common HZ cases:
*/
-inline unsigned int jiffies_to_msecs(const unsigned long j)
+unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
@@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
-inline unsigned int jiffies_to_usecs(const unsigned long j)
+unsigned int jiffies_to_usecs(const unsigned long j)
{
#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db1261..24510d84efd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
config ARCH_CLOCKSOURCE_DATA
bool
+# Platforms has a persistent clock
+config ALWAYS_USE_PERSISTENT_CLOCK
+ bool
+ default n
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD
default y
depends on GENERIC_CLOCKEVENTS
+# Architecture can handle broadcast in a driver-agnostic way
+config ARCH_HAS_TICK_BROADCAST
+ bool
+
# Clockevents broadcasting infrastructure
config GENERIC_CLOCKEVENTS_BROADCAST
bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977c..c6d6400ee137 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,
clockevents_config(dev, freq);
clockevents_register_device(dev);
}
+EXPORT_SYMBOL_GPL(clockevents_config_and_register);
/**
* clockevents_update_freq - Update frequency and reprogram a clock event device.
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669b..072bb066bb7d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/rtc.h>
#include "tick-internal.h"
@@ -22,7 +23,7 @@
* NTP timekeeping variables:
*/
-DEFINE_SPINLOCK(ntp_lock);
+DEFINE_RAW_SPINLOCK(ntp_lock);
/* USER_HZ period (usecs): */
@@ -347,7 +348,7 @@ void ntp_clear(void)
{
unsigned long flags;
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
@@ -361,7 +362,7 @@ void ntp_clear(void)
/* Clear PPS state variables */
pps_clear();
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
}
@@ -371,9 +372,9 @@ u64 ntp_tick_length(void)
unsigned long flags;
s64 ret;
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
ret = tick_length;
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
return ret;
}
@@ -394,7 +395,7 @@ int second_overflow(unsigned long secs)
int leap = 0;
unsigned long flags;
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
/*
* Leap second processing. If in leap-insert state at the end of the
@@ -478,13 +479,12 @@ int second_overflow(unsigned long secs)
time_adjust = 0;
out:
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
return leap;
}
-#ifdef CONFIG_GENERIC_CMOS_UPDATE
-
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_cmos_clock(struct work_struct *work);
static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)
}
getnstimeofday(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
- fail = update_persistent_clock(now);
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
+ struct timespec adjust = now;
+
+ fail = -ENODEV;
+ if (persistent_clock_is_local)
+ adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+ fail = update_persistent_clock(adjust);
+#endif
+#ifdef CONFIG_RTC_SYSTOHC
+ if (fail == -ENODEV)
+ fail = rtc_set_ntp_time(adjust);
+#endif
+ }
next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
if (next.tv_nsec <= 0)
next.tv_nsec += NSEC_PER_SEC;
- if (!fail)
+ if (!fail || fail == -ENODEV)
next.tv_sec = 659;
else
next.tv_sec = 0;
@@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc)
getnstimeofday(&ts);
- spin_lock_irq(&ntp_lock);
+ raw_spin_lock_irq(&ntp_lock);
if (txc->modes & ADJ_ADJTIME) {
long save_adjust = time_adjust;
@@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc)
/* fill PPS status fields */
pps_fill_timex(txc);
- spin_unlock_irq(&ntp_lock);
+ raw_spin_unlock_irq(&ntp_lock);
txc->time.tv_sec = ts.tv_sec;
txc->time.tv_usec = ts.tv_nsec;
@@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
pts_norm = pps_normalize_ts(*phase_ts);
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
/* clear the error bits, they will be set again if needed */
time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
* just start the frequency interval */
if (unlikely(pps_fbase.tv_sec == 0)) {
pps_fbase = *raw_ts;
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
return;
}
@@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
time_status |= STA_PPSJITTER;
/* restart the frequency calibration interval */
pps_fbase = *raw_ts;
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
pr_err("hardpps: PPSJITTER: bad pulse\n");
return;
}
@@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
hardpps_update_phase(pts_norm.nsec);
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
}
EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..2fb8cb88df8d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
+#include <linux/smp.h>
#include "tick-internal.h"
@@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
return (dev && tick_broadcast_device.evtdev == dev);
}
+static void err_broadcast(const struct cpumask *mask)
+{
+ pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
+}
+
+static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
+{
+ if (!dev->broadcast)
+ dev->broadcast = tick_broadcast;
+ if (!dev->broadcast) {
+ pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
+ dev->name);
+ dev->broadcast = err_broadcast;
+ }
+}
+
/*
* Check, if the device is disfunctional and a place holder, which
* needs to be handled by the broadcast device.
@@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
*/
if (!tick_device_is_functional(dev)) {
dev->event_handler = tick_handle_periodic;
+ tick_device_setup_broadcast_func(dev);
cpumask_set_cpu(cpu, tick_get_broadcast_mask());
tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
ret = 1;
@@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
*/
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
int cpu = smp_processor_id();
-
cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
tick_broadcast_clear_oneshot(cpu);
+ } else {
+ tick_device_setup_broadcast_func(dev);
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
return ret;
}
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+int tick_receive_broadcast(void)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ struct clock_event_device *evt = td->evtdev;
+
+ if (!evt)
+ return -ENODEV;
+
+ if (!evt->event_handler)
+ return -EINVAL;
+
+ evt->event_handler(evt);
+ return 0;
+}
+#endif
+
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552d9fd1..a19a39952c1b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/irq_work.h>
#include <asm/irq_regs.h>
@@ -28,7 +29,7 @@
/*
* Per cpu nohz control structure
*/
-static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
/*
* The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
time_delta = timekeeping_max_deferment();
} while (read_seqretry(&jiffies_lock, seq));
- if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
- arch_needs_cpu(cpu)) {
+ if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
+ arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
@@ -553,6 +554,7 @@ void tick_nohz_idle_enter(void)
local_irq_enable();
}
+EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -631,8 +633,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
+
+ if (vtime_accounting_enabled())
+ return;
/*
* We stopped the tick in idle. Update process times would miss the
* time we slept as update_process_times does only a 1 tick
@@ -681,6 +686,7 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
+EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb0db3f..9a0bc98fbe1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -29,6 +29,9 @@ static struct timekeeper timekeeper;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
+/* Flag for if there is a persistent clock on this platform */
+bool __read_mostly persistent_clock_exist = false;
+
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -135,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
}
/* Timekeeper helper functions. */
+
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+u32 (*arch_gettimeoffset)(void);
+
+u32 get_arch_timeoffset(void)
+{
+ if (likely(arch_gettimeoffset))
+ return arch_gettimeoffset();
+ return 0;
+}
+#else
+static inline u32 get_arch_timeoffset(void) { return 0; }
+#endif
+
static inline s64 timekeeping_get_ns(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
@@ -151,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
nsec = cycle_delta * tk->mult + tk->xtime_nsec;
nsec >>= tk->shift;
- /* If arch requires, add in gettimeoffset() */
- return nsec + arch_gettimeoffset();
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + get_arch_timeoffset();
}
static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
@@ -171,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
/* convert delta to nanoseconds. */
nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
- /* If arch requires, add in gettimeoffset() */
- return nsec + arch_gettimeoffset();
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + get_arch_timeoffset();
}
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
@@ -254,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->xtime_nsec += cycle_delta * tk->mult;
- /* If arch requires, add in gettimeoffset() */
- tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
tk_normalize_xtime(tk);
@@ -264,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)
}
/**
- * getnstimeofday - Returns the time of day in a timespec
+ * __getnstimeofday - Returns the time of day in a timespec.
* @ts: pointer to the timespec to be set
*
- * Returns the time of day in a timespec.
+ * Updates the time of day in the timespec.
+ * Returns 0 on success, or -ve when suspended (timespec will be undefined).
*/
-void getnstimeofday(struct timespec *ts)
+int __getnstimeofday(struct timespec *ts)
{
struct timekeeper *tk = &timekeeper;
unsigned long seq;
s64 nsecs = 0;
- WARN_ON(timekeeping_suspended);
-
do {
seq = read_seqbegin(&tk->lock);
@@ -287,6 +303,26 @@ void getnstimeofday(struct timespec *ts)
ts->tv_nsec = 0;
timespec_add_ns(ts, nsecs);
+
+ /*
+ * Do not bail out early, in case there were callers still using
+ * the value, even in the face of the WARN_ON.
+ */
+ if (unlikely(timekeeping_suspended))
+ return -EAGAIN;
+ return 0;
+}
+EXPORT_SYMBOL(__getnstimeofday);
+
+/**
+ * getnstimeofday - Returns the time of day in a timespec.
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec (WARN if suspended).
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ WARN_ON(__getnstimeofday(ts));
}
EXPORT_SYMBOL(getnstimeofday);
@@ -640,12 +676,14 @@ void __init timekeeping_init(void)
struct timespec now, boot, tmp;
read_persistent_clock(&now);
+
if (!timespec_valid_strict(&now)) {
pr_warn("WARNING: Persistent clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
now.tv_sec = 0;
now.tv_nsec = 0;
- }
+ } else if (now.tv_sec || now.tv_nsec)
+ persistent_clock_exist = true;
read_boot_clock(&boot);
if (!timespec_valid_strict(&boot)) {
@@ -718,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
{
struct timekeeper *tk = &timekeeper;
unsigned long flags;
- struct timespec ts;
- /* Make sure we don't set the clock twice */
- read_persistent_clock(&ts);
- if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
+ /*
+ * Make sure we don't set the clock twice, as timekeeping_resume()
+ * already did it
+ */
+ if (has_persistent_clock())
return;
write_seqlock_irqsave(&tk->lock, flags);
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
+scale=0
+
+define gcd(a,b) {
+ auto t;
+ while (b) {
+ t = b;
+ b = a % b;
+ a = t;
+ }
+ return a;
+}
+
+/* Division by reciprocal multiplication. */
+define fmul(b,n,d) {
+ return (2^b*n+d-1)/d;
+}
+
+/* Adjustment factor when a ceiling value is used. Use as:
+ (imul * n) + (fmulxx * n + fadjxx) >> xx) */
+define fadj(b,n,d) {
+ auto v;
+ d = d/gcd(n,d);
+ v = 2^b*(d-1)/d;
+ return v;
+}
+
+/* Compute the appropriate mul/adj values as well as a shift count,
+ which brings the mul value into the range 2^b-1 <= x < 2^b. Such
+ a shift value will be correct in the signed integer range and off
+ by at most one in the upper half of the unsigned range. */
+define fmuls(b,n,d) {
+ auto s, m;
+ for (s = 0; 1; s++) {
+ m = fmul(s,n,d);
+ if (m >= 2^(b-1))
+ return s;
+ }
+ return 0;
+}
+
+define timeconst(hz) {
+ print "/* Automatically generated by kernel/timeconst.bc */\n"
+ print "/* Time conversion constants for HZ == ", hz, " */\n"
+ print "\n"
+
+ print "#ifndef KERNEL_TIMECONST_H\n"
+ print "#define KERNEL_TIMECONST_H\n\n"
+
+ print "#include <linux/param.h>\n"
+ print "#include <linux/types.h>\n\n"
+
+ print "#if HZ != ", hz, "\n"
+ print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#endif\n\n"
+
+ if (hz < 2) {
+ print "#error Totally bogus HZ value!\n"
+ } else {
+ s=fmuls(32,1000,hz)
+ obase=16
+ print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
+ print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000)
+ obase=16
+ print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
+ print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
+ obase=10
+ print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000)
+ print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
+ print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
+ print "\n"
+
+ s=fmuls(32,1000000,hz)
+ obase=16
+ print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
+ print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_USEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000000)
+ obase=16
+ print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
+ print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
+ obase=10
+ print "#define USEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000000)
+ print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
+ print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
+ print "\n"
+
+ print "#endif /* KERNEL_TIMECONST_H */\n"
+ }
+ halt
+}
+
+timeconst(hz)
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index eb51d76e058a..000000000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,378 +0,0 @@
-#!/usr/bin/perl
-# -----------------------------------------------------------------------
-#
-# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
-#
-# This file is part of the Linux kernel, and is made available under
-# the terms of the GNU General Public License version 2 or (at your
-# option) any later version; incorporated herein by reference.
-#
-# -----------------------------------------------------------------------
-#
-
-#
-# Usage: timeconst.pl HZ > timeconst.h
-#
-
-# Precomputed values for systems without Math::BigInt
-# Generated by:
-# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
-%canned_values = (
- 24 => [
- '0xa6aaaaab','0x2aaaaaa',26,
- 125,3,
- '0xc49ba5e4','0x1fbe76c8b4',37,
- 3,125,
- '0xa2c2aaab','0xaaaa',16,
- 125000,3,
- '0xc9539b89','0x7fffbce4217d',47,
- 3,125000,
- ], 32 => [
- '0xfa000000','0x6000000',27,
- 125,4,
- '0x83126e98','0xfdf3b645a',36,
- 4,125,
- '0xf4240000','0x0',17,
- 31250,1,
- '0x8637bd06','0x3fff79c842fa',46,
- 1,31250,
- ], 48 => [
- '0xa6aaaaab','0x6aaaaaa',27,
- 125,6,
- '0xc49ba5e4','0xfdf3b645a',36,
- 6,125,
- '0xa2c2aaab','0x15555',17,
- 62500,3,
- '0xc9539b89','0x3fffbce4217d',46,
- 3,62500,
- ], 64 => [
- '0xfa000000','0xe000000',28,
- 125,8,
- '0x83126e98','0x7ef9db22d',35,
- 8,125,
- '0xf4240000','0x0',18,
- 15625,1,
- '0x8637bd06','0x1fff79c842fa',45,
- 1,15625,
- ], 100 => [
- '0xa0000000','0x0',28,
- 10,1,
- '0xcccccccd','0x733333333',35,
- 1,10,
- '0x9c400000','0x0',18,
- 10000,1,
- '0xd1b71759','0x1fff2e48e8a7',45,
- 1,10000,
- ], 122 => [
- '0x8325c53f','0xfbcda3a',28,
- 500,61,
- '0xf9db22d1','0x7fbe76c8b',35,
- 61,500,
- '0x8012e2a0','0x3ef36',18,
- 500000,61,
- '0xffda4053','0x1ffffbce4217',45,
- 61,500000,
- ], 128 => [
- '0xfa000000','0x1e000000',29,
- 125,16,
- '0x83126e98','0x3f7ced916',34,
- 16,125,
- '0xf4240000','0x40000',19,
- 15625,2,
- '0x8637bd06','0xfffbce4217d',44,
- 2,15625,
- ], 200 => [
- '0xa0000000','0x0',29,
- 5,1,
- '0xcccccccd','0x333333333',34,
- 1,5,
- '0x9c400000','0x0',19,
- 5000,1,
- '0xd1b71759','0xfff2e48e8a7',44,
- 1,5000,
- ], 250 => [
- '0x80000000','0x0',29,
- 4,1,
- '0x80000000','0x180000000',33,
- 1,4,
- '0xfa000000','0x0',20,
- 4000,1,
- '0x83126e98','0x7ff7ced9168',43,
- 1,4000,
- ], 256 => [
- '0xfa000000','0x3e000000',30,
- 125,32,
- '0x83126e98','0x1fbe76c8b',33,
- 32,125,
- '0xf4240000','0xc0000',20,
- 15625,4,
- '0x8637bd06','0x7ffde7210be',43,
- 4,15625,
- ], 300 => [
- '0xd5555556','0x2aaaaaaa',30,
- 10,3,
- '0x9999999a','0x1cccccccc',33,
- 3,10,
- '0xd0555556','0xaaaaa',20,
- 10000,3,
- '0x9d495183','0x7ffcb923a29',43,
- 3,10000,
- ], 512 => [
- '0xfa000000','0x7e000000',31,
- 125,64,
- '0x83126e98','0xfdf3b645',32,
- 64,125,
- '0xf4240000','0x1c0000',21,
- 15625,8,
- '0x8637bd06','0x3ffef39085f',42,
- 8,15625,
- ], 1000 => [
- '0x80000000','0x0',31,
- 1,1,
- '0x80000000','0x0',31,
- 1,1,
- '0xfa000000','0x0',22,
- 1000,1,
- '0x83126e98','0x1ff7ced9168',41,
- 1,1000,
- ], 1024 => [
- '0xfa000000','0xfe000000',32,
- 125,128,
- '0x83126e98','0x7ef9db22',31,
- 128,125,
- '0xf4240000','0x3c0000',22,
- 15625,16,
- '0x8637bd06','0x1fff79c842f',41,
- 16,15625,
- ], 1200 => [
- '0xd5555556','0xd5555555',32,
- 5,6,
- '0x9999999a','0x66666666',31,
- 6,5,
- '0xd0555556','0x2aaaaa',22,
- 2500,3,
- '0x9d495183','0x1ffcb923a29',41,
- 3,2500,
- ]
-);
-
-$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
-
-sub bint($)
-{
- my($x) = @_;
- return Math::BigInt->new($x);
-}
-
-#
-# Constants for division by reciprocal multiplication.
-# (bits, numerator, denominator)
-#
-sub fmul($$$)
-{
- my ($b,$n,$d) = @_;
-
- $n = bint($n);
- $d = bint($d);
-
- return scalar (($n << $b)+$d-bint(1))/$d;
-}
-
-sub fadj($$$)
-{
- my($b,$n,$d) = @_;
-
- $n = bint($n);
- $d = bint($d);
-
- $d = $d/bgcd($n, $d);
- return scalar (($d-bint(1)) << $b)/$d;
-}
-
-sub fmuls($$$) {
- my($b,$n,$d) = @_;
- my($s,$m);
- my($thres) = bint(1) << ($b-1);
-
- $n = bint($n);
- $d = bint($d);
-
- for ($s = 0; 1; $s++) {
- $m = fmul($s,$n,$d);
- return $s if ($m >= $thres);
- }
- return 0;
-}
-
-# Generate a hex value if the result fits in 64 bits;
-# otherwise skip.
-sub bignum_hex($) {
- my($x) = @_;
- my $s = $x->as_hex();
-
- return (length($s) > 18) ? undef : $s;
-}
-
-# Provides mul, adj, and shr factors for a specific
-# (bit, time, hz) combination
-sub muladj($$$) {
- my($b, $t, $hz) = @_;
- my $s = fmuls($b, $t, $hz);
- my $m = fmul($s, $t, $hz);
- my $a = fadj($s, $t, $hz);
- return (bignum_hex($m), bignum_hex($a), $s);
-}
-
-# Provides numerator, denominator values
-sub numden($$) {
- my($n, $d) = @_;
- my $g = bgcd($n, $d);
- return ($n/$g, $d/$g);
-}
-
-# All values for a specific (time, hz) combo
-sub conversions($$) {
- my ($t, $hz) = @_;
- my @val = ();
-
- # HZ_TO_xx
- push(@val, muladj(32, $t, $hz));
- push(@val, numden($t, $hz));
-
- # xx_TO_HZ
- push(@val, muladj(32, $hz, $t));
- push(@val, numden($hz, $t));
-
- return @val;
-}
-
-sub compute_values($) {
- my($hz) = @_;
- my @val = ();
- my $s, $m, $a, $g;
-
- if (!$has_bigint) {
- die "$0: HZ == $hz not canned and ".
- "Math::BigInt not available\n";
- }
-
- # MSEC conversions
- push(@val, conversions(1000, $hz));
-
- # USEC conversions
- push(@val, conversions(1000000, $hz));
-
- return @val;
-}
-
-sub outputval($$)
-{
- my($name, $val) = @_;
- my $csuf;
-
- if (defined($val)) {
- if ($name !~ /SHR/) {
- $val = "U64_C($val)";
- }
- printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
- }
-}
-
-sub output($@)
-{
- my($hz, @val) = @_;
- my $pfx, $bit, $suf, $s, $m, $a;
-
- print "/* Automatically generated by kernel/timeconst.pl */\n";
- print "/* Conversion constants for HZ == $hz */\n";
- print "\n";
- print "#ifndef KERNEL_TIMECONST_H\n";
- print "#define KERNEL_TIMECONST_H\n";
- print "\n";
-
- print "#include <linux/param.h>\n";
- print "#include <linux/types.h>\n";
-
- print "\n";
- print "#if HZ != $hz\n";
- print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
- print "#endif\n";
- print "\n";
-
- foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
- 'HZ_TO_USEC','USEC_TO_HZ') {
- foreach $bit (32) {
- foreach $suf ('MUL', 'ADJ', 'SHR') {
- outputval("${pfx}_$suf$bit", shift(@val));
- }
- }
- foreach $suf ('NUM', 'DEN') {
- outputval("${pfx}_$suf", shift(@val));
- }
- }
-
- print "\n";
- print "#endif /* KERNEL_TIMECONST_H */\n";
-}
-
-# Pretty-print Perl values
-sub perlvals(@) {
- my $v;
- my @l = ();
-
- foreach $v (@_) {
- if (!defined($v)) {
- push(@l, 'undef');
- } elsif ($v =~ /^0x/) {
- push(@l, "\'".$v."\'");
- } else {
- push(@l, $v.'');
- }
- }
- return join(',', @l);
-}
-
-($hz) = @ARGV;
-
-# Use this to generate the %canned_values structure
-if ($hz eq '--can') {
- shift(@ARGV);
- @hzlist = sort {$a <=> $b} (@ARGV);
-
- print "# Precomputed values for systems without Math::BigInt\n";
- print "# Generated by:\n";
- print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
- print "\%canned_values = (\n";
- my $pf = "\t";
- foreach $hz (@hzlist) {
- my @values = compute_values($hz);
- print "$pf$hz => [\n";
- while (scalar(@values)) {
- my $bit;
- foreach $bit (32) {
- my $m = shift(@values);
- my $a = shift(@values);
- my $s = shift(@values);
- print "\t\t", perlvals($m,$a,$s), ",\n";
- }
- my $n = shift(@values);
- my $d = shift(@values);
- print "\t\t", perlvals($n,$d), ",\n";
- }
- print "\t]";
- $pf = ', ';
- }
- print "\n);\n";
-} else {
- $hz += 0; # Force to number
- if ($hz < 1) {
- die "Usage: $0 HZ\n";
- }
-
- @val = @{$canned_values{$hz}};
- if (!defined(@val)) {
- @val = compute_values($hz);
- }
- output($hz, @val);
-}
-exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d00858482..dbf7a78a1ef1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)
account_process_tick(p, user_tick);
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
- printk_tick();
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485f..192473b22799 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
help
See Documentation/trace/ftrace-design.txt
+config HAVE_DYNAMIC_FTRACE_WITH_REGS
+ bool
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -78,21 +81,6 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
-config EVENT_POWER_TRACING_DEPRECATED
- depends on EVENT_TRACING
- bool "Deprecated power event trace API, to be removed"
- default y
- help
- Provides old power event types:
- C-state/idle accounting events:
- power:power_start
- power:power_end
- and old cpufreq accounting event:
- power:power_frequency
- This is for userspace compatibility
- and will vanish after 5 kernel iterations,
- namely 3.1.
-
config CONTEXT_SWITCH_TRACER
bool
@@ -250,6 +238,16 @@ config FTRACE_SYSCALLS
help
Basic tracer to catch the syscall entry and exit events.
+config TRACER_SNAPSHOT
+ bool "Create a snapshot trace buffer"
+ select TRACER_MAX_TRACE
+ help
+ Allow tracing users to take snapshot of the current buffer using the
+ ftrace interface, e.g.:
+
+ echo 1 > /sys/kernel/debug/tracing/snapshot
+ cat snapshot
+
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -434,6 +432,11 @@ config DYNAMIC_FTRACE
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
+config DYNAMIC_FTRACE_WITH_REGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..9e5b8c272eec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
return;
local_irq_save(flags);
- buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+ buf = this_cpu_ptr(bt->msg_data);
va_start(args, fmt);
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore,
struct request_queue *q,
struct request *rq)
{
+ struct blk_trace *bt = q->blk_trace;
+
+ /* if control ever passes through here, it's a request based driver */
+ if (unlikely(bt && !bt->rq_based))
+ bt->rq_based = true;
+
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
}
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore,
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
-static void blk_add_trace_bio_complete(void *ignore,
- struct request_queue *q, struct bio *bio,
- int error)
+static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
{
+ struct request_queue *q;
+ struct blk_trace *bt;
+
+ if (!bio->bi_bdev)
+ return;
+
+ q = bdev_get_queue(bio->bi_bdev);
+ bt = q->blk_trace;
+
+ /*
+ * Request based drivers will generate both rq and bio completions.
+ * Ignore bio ones.
+ */
+ if (likely(!bt) || bt->rq_based)
+ return;
+
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
static void blk_add_trace_bio_backmerge(void *ignore,
struct request_queue *q,
+ struct request *rq,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
static void blk_add_trace_bio_frontmerge(void *ignore,
struct request_queue *q,
+ struct request *rq,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index afd092de45b7..ab25b88aae56 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
#endif
+/*
+ * Traverse the ftrace_global_list, invoking all entries. The reason that we
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism. The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_global_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
+#define do_for_each_ftrace_op(op, list) \
+ op = rcu_dereference_raw(list); \
+ do
+
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ */
+#define while_for_each_ftrace_op(op) \
+ while (likely(op = rcu_dereference_raw((op)->next)) && \
+ unlikely((op) != &ftrace_list_end))
+
/**
* ftrace_nr_registered_ops - return number of ops registered
*
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
return cnt;
}
-/*
- * Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
static void
ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
- if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
+ if (bit < 0)
return;
- trace_recursion_set(TRACE_GLOBAL_BIT);
- op = rcu_dereference_raw(ftrace_global_list); /*see above*/
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_global_list) {
op->func(ip, parent_ip, op, regs);
- op = rcu_dereference_raw(op->next); /*see above*/
- };
- trace_recursion_clear(TRACE_GLOBAL_BIT);
+ } while_for_each_ftrace_op(op);
+
+ trace_clear_recursion(bit);
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
* registered callers.
*/
if (ftrace_global_list == &ftrace_list_end ||
- ftrace_global_list->next == &ftrace_list_end)
+ ftrace_global_list->next == &ftrace_list_end) {
func = ftrace_global_list->func;
- else
+ /*
+ * As we are calling the function directly.
+ * If it does not have recursion protection,
+ * the function_trace_op needs to be updated
+ * accordingly.
+ */
+ if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
+ global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+ else
+ global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
+ } else {
func = ftrace_global_list_func;
+ /* The list has its own recursion protection. */
+ global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+ }
+
/* If we filter on pids, update to use the pid function */
if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
return -EINVAL;
-#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
/*
* If the ftrace_ops specifies SAVE_REGS, then it only can be used
* if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -736,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
{
struct ftrace_profile *rec;
struct hlist_head *hhd;
- struct hlist_node *n;
unsigned long key;
key = hash_long(ip, ftrace_profile_bits);
@@ -745,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
if (hlist_empty(hhd))
return NULL;
- hlist_for_each_entry_rcu(rec, n, hhd, node) {
+ hlist_for_each_entry_rcu(rec, hhd, node) {
if (rec->ip == ip)
return rec;
}
@@ -1107,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
unsigned long key;
struct ftrace_func_entry *entry;
struct hlist_head *hhd;
- struct hlist_node *n;
if (ftrace_hash_empty(hash))
return NULL;
@@ -1119,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
hhd = &hash->buckets[key];
- hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
+ hlist_for_each_entry_rcu(entry, hhd, hlist) {
if (entry->ip == ip)
return entry;
}
@@ -1176,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash,
static void ftrace_hash_clear(struct ftrace_hash *hash)
{
struct hlist_head *hhd;
- struct hlist_node *tp, *tn;
+ struct hlist_node *tn;
struct ftrace_func_entry *entry;
int size = 1 << hash->size_bits;
int i;
@@ -1186,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
for (i = 0; i < size; i++) {
hhd = &hash->buckets[i];
- hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist)
free_hash_entry(hash, entry);
}
FTRACE_WARN_ON(hash->count);
@@ -1249,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
{
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
- struct hlist_node *tp;
int size;
int ret;
int i;
@@ -1264,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
ret = add_hash_entry(new_hash, entry->ip);
if (ret < 0)
goto free_hash;
@@ -1290,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
{
struct ftrace_func_entry *entry;
- struct hlist_node *tp, *tn;
+ struct hlist_node *tn;
struct hlist_head *hhd;
struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
@@ -1336,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
- hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
if (bits > 0)
key = hash_long(entry->ip, bits);
else
@@ -2675,12 +2698,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
}
loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
{
loff_t ret;
if (file->f_mode & FMODE_READ)
- ret = seq_lseek(file, offset, origin);
+ ret = seq_lseek(file, offset, whence);
else
file->f_pos = ret = 1;
@@ -2875,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
{
struct ftrace_func_probe *entry;
struct hlist_head *hhd;
- struct hlist_node *n;
unsigned long key;
key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2891,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
* on the hash. rcu_read_lock is too dangerous here.
*/
preempt_disable_notrace();
- hlist_for_each_entry_rcu(entry, n, hhd, node) {
+ hlist_for_each_entry_rcu(entry, hhd, node) {
if (entry->ip == ip)
entry->ops->func(ip, parent_ip, &entry->data);
}
@@ -3042,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data, int flags)
{
struct ftrace_func_probe *entry;
- struct hlist_node *n, *tmp;
+ struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
int type = MATCH_FULL;
int i, len = 0;
@@ -3065,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
- hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
+ hlist_for_each_entry_safe(entry, tmp, hhd, node) {
/* break up if statements for readability */
if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
@@ -3970,35 +3992,49 @@ static void ftrace_init_module(struct module *mod,
ftrace_process_locs(mod, start, end);
}
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify_enter(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
- switch (val) {
- case MODULE_STATE_COMING:
+ if (val == MODULE_STATE_COMING)
ftrace_init_module(mod, mod->ftrace_callsites,
mod->ftrace_callsites +
mod->num_ftrace_callsites);
- break;
- case MODULE_STATE_GOING:
+ return 0;
+}
+
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ if (val == MODULE_STATE_GOING)
ftrace_release_mod(mod);
- break;
- }
return 0;
}
#else
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify_enter(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
{
return 0;
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_nb = {
- .notifier_call = ftrace_module_notify,
- .priority = 0,
+struct notifier_block ftrace_module_enter_nb = {
+ .notifier_call = ftrace_module_notify_enter,
+ .priority = INT_MAX, /* Run before anything that can use kprobes */
+};
+
+struct notifier_block ftrace_module_exit_nb = {
+ .notifier_call = ftrace_module_notify_exit,
+ .priority = INT_MIN, /* Run after anything that can remove kprobes */
};
extern unsigned long __start_mcount_loc[];
@@ -4032,9 +4068,13 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_nb);
+ ret = register_module_notifier(&ftrace_module_enter_nb);
if (ret)
- pr_warning("Failed to register trace ftrace module notifier\n");
+ pr_warning("Failed to register trace ftrace module enter notifier\n");
+
+ ret = register_module_notifier(&ftrace_module_exit_nb);
+ if (ret)
+ pr_warning("Failed to register trace ftrace module exit notifier\n");
set_ftrace_early_filters();
@@ -4090,14 +4130,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
*/
preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT);
- op = rcu_dereference_raw(ftrace_control_list);
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_control_list) {
if (!ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
-
- op = rcu_dereference_raw(op->next);
- };
+ } while_for_each_ftrace_op(op);
trace_recursion_clear(TRACE_CONTROL_BIT);
preempt_enable_notrace();
}
@@ -4112,27 +4149,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
+ int bit;
if (function_trace_stop)
return;
- if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
return;
- trace_recursion_set(TRACE_INTERNAL_BIT);
/*
* Some of the ops may be dynamically allocated,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
- op = rcu_dereference_raw(ftrace_ops_list);
- while (op != &ftrace_list_end) {
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
- op = rcu_dereference_raw(op->next);
- };
+ } while_for_each_ftrace_op(op);
preempt_enable_notrace();
- trace_recursion_clear(TRACE_INTERNAL_BIT);
+ trace_clear_recursion(bit);
}
/*
@@ -4143,8 +4179,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* Archs are to support both the regs and ftrace_ops at the same time.
* If they support ftrace_ops, it is assumed they support regs.
* If call backs want to use regs, they must either check for regs
- * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
- * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved.
+ * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
+ * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
* set the ARCH_SUPPORT_FTARCE_OPS.
*/
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
-#ifdef EVENT_POWER_TRACING_DEPRECATED
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
-#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedcd..6989df2ba194 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ftrace_event.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
+#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
#include <linux/fs.h>
#include <asm/local.h>
-#include "trace.h"
static void update_pages_handler(struct work_struct *work);
@@ -177,7 +178,7 @@ void tracing_off_permanent(void)
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
# define RB_FORCE_8BYTE_ALIGNMENT 0
# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
#else
@@ -185,6 +186,8 @@ void tracing_off_permanent(void)
# define RB_ARCH_ALIGNMENT 8U
#endif
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
+
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -333,7 +336,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
- unsigned char data[]; /* data of buffer page */
+ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
/*
@@ -2432,41 +2435,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
#ifdef CONFIG_TRACING
-#define TRACE_RECURSIVE_DEPTH 16
+/*
+ * The lock and unlock are done within a preempt disable section.
+ * The current_context per_cpu variable can only be modified
+ * by the current task between lock and unlock. But it can
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
+ */
+static DEFINE_PER_CPU(unsigned int, current_context);
-/* Keep this code out of the fast path cache */
-static noinline void trace_recursive_fail(void)
+static __always_inline int trace_recursive_lock(void)
{
- /* Disable all tracing before we do anything else */
- tracing_off_permanent();
-
- printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
- "HC[%lu]:SC[%lu]:NMI[%lu]\n",
- trace_recursion_buffer(),
- hardirq_count() >> HARDIRQ_SHIFT,
- softirq_count() >> SOFTIRQ_SHIFT,
- in_nmi());
-
- WARN_ON_ONCE(1);
-}
+ unsigned int val = this_cpu_read(current_context);
+ int bit;
-static inline int trace_recursive_lock(void)
-{
- trace_recursion_inc();
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
- if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
- return 0;
+ if (unlikely(val & (1 << bit)))
+ return 1;
- trace_recursive_fail();
+ val |= (1 << bit);
+ this_cpu_write(current_context, val);
- return -1;
+ return 0;
}
-static inline void trace_recursive_unlock(void)
+static __always_inline void trace_recursive_unlock(void)
{
- WARN_ON_ONCE(!trace_recursion_buffer());
+ unsigned int val = this_cpu_read(current_context);
- trace_recursion_dec();
+ val--;
+ val &= this_cpu_read(current_context);
+ this_cpu_write(current_context, val);
}
#else
@@ -3067,6 +3105,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
/**
+ * ring_buffer_read_events_cpu - get the number of events successfully read
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of events read
+ */
+unsigned long
+ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->read;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
+
+/**
* ring_buffer_entries - get the number of entries in a buffer
* @buffer: The ring buffer
*
@@ -3425,7 +3481,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
/* check for end of page padding */
if ((iter->head >= rb_page_size(iter->head_page)) &&
(iter->head_page != cpu_buffer->commit_page))
- rb_advance_iter(iter);
+ rb_inc_iter(iter);
}
static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 61e081b4ba11..c2e2c2310374 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
+#include <linux/sched/rt.h>
#include "trace.h"
#include "trace_output.h"
@@ -249,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
static struct tracer *trace_types __read_mostly;
/* current_trace points to the tracer that is currently active */
-static struct tracer *current_trace __read_mostly;
+static struct tracer *current_trace __read_mostly = &nop_trace;
/*
* trace_types_lock is used to protect the trace_types list.
@@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+
+ if (!current_trace->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(current_trace != &nop_trace);
return;
}
+
arch_spin_lock(&ftrace_max_lock);
tr->buffer = max_tr.buffer;
@@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->use_max_tr) {
- WARN_ON_ONCE(1);
+ if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
return;
- }
arch_spin_lock(&ftrace_max_lock);
@@ -862,10 +864,13 @@ int register_tracer(struct tracer *type)
current_trace = type;
- /* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, trace_buf_size,
- RING_BUFFER_ALL_CPUS);
+ if (type->use_max_tr) {
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(max_tr.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ type->allocated_snapshot = true;
+ }
/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +886,14 @@ int register_tracer(struct tracer *type)
/* Only reset on passing, to avoid touching corrupted buffers */
tracing_reset_online_cpus(tr);
- /* Shrink the max buffer again */
- if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, 1,
- RING_BUFFER_ALL_CPUS);
+ if (type->use_max_tr) {
+ type->allocated_snapshot = false;
+
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ }
printk(KERN_CONT "PASSED\n");
}
@@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
{
struct ring_buffer *buffer = tr->buffer;
+ if (!buffer)
+ return;
+
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
@@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
struct ring_buffer *buffer = tr->buffer;
int cpu;
+ if (!buffer)
+ return;
+
ring_buffer_record_disable(buffer);
/* Make sure all commits have finished */
@@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->padding = 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
preempt_disable_notrace();
- use_stack = ++__get_cpu_var(ftrace_stack_reserve);
+ use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
/*
* We don't need any atomic variables, just a barrier.
* If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
out:
/* Again, don't let gcc optimize things here */
barrier();
- __get_cpu_var(ftrace_stack_reserve)--;
+ __this_cpu_dec(ftrace_stack_reserve);
preempt_enable_notrace();
}
@@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
static char *get_trace_buf(void)
{
struct trace_buffer_struct *percpu_buffer;
- struct trace_buffer_struct *buffer;
/*
* If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1548,7 @@ static char *get_trace_buf(void)
if (!percpu_buffer)
return NULL;
- buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
-
- return buffer->buffer;
+ return this_cpu_ptr(&percpu_buffer->buffer[0]);
}
static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct trace_iterator *iter = m->private;
- static struct tracer *old_tracer;
int cpu_file = iter->cpu_file;
void *p = NULL;
loff_t l = 0;
int cpu;
- /* copy the tracer to avoid using a global lock all around */
+ /*
+ * copy the tracer to avoid using a global lock all around.
+ * iter->trace is a copy of current_trace, the pointer to the
+ * name may be used instead of a strcmp(), as iter->trace->name
+ * will point to the same string as current_trace->name.
+ */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(current_trace && iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
- atomic_inc(&trace_record_cmdline_disabled);
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return ERR_PTR(-EBUSY);
+
+ if (!iter->snapshot)
+ atomic_inc(&trace_record_cmdline_disabled);
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p)
{
struct trace_iterator *iter = m->private;
- atomic_dec(&trace_record_cmdline_disabled);
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return;
+
+ if (!iter->snapshot)
+ atomic_dec(&trace_record_cmdline_disabled);
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
@@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
unsigned long total;
const char *name = "preemption";
- if (type)
- name = type->name;
+ name = type->name;
get_total_entries(tr, &total, &entries);
@@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = {
};
static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file)
+__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
long cpu_file = (long) inode->i_private;
struct trace_iterator *iter;
@@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file)
if (!iter->trace)
goto fail;
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
- if (current_trace && current_trace->print_max)
+ if (current_trace->print_max || snapshot)
iter->tr = &max_tr;
else
iter->tr = &global_trace;
+ iter->snapshot = snapshot;
iter->pos = -1;
mutex_init(&iter->mutex);
iter->cpu_file = cpu_file;
@@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file)
if (trace_clocks[trace_clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
- /* stop the trace while dumping */
- tracing_stop();
+ /* stop the trace while dumping if we are not opening "snapshot" */
+ if (!iter->snapshot)
+ tracing_stop();
if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
for_each_tracing_cpu(cpu) {
@@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file)
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- /* reenable tracing if it was previously enabled */
- tracing_start();
+ if (!iter->snapshot)
+ /* reenable tracing if it was previously enabled */
+ tracing_start();
mutex_unlock(&trace_types_lock);
mutex_destroy(&iter->mutex);
@@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file)
}
if (file->f_mode & FMODE_READ) {
- iter = __tracing_open(inode, file);
+ iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -2899,6 +2921,8 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = 0;
+
trace_set_options(buf);
*ppos += cnt;
@@ -3012,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int r;
mutex_lock(&trace_types_lock);
- if (current_trace)
- r = sprintf(buf, "%s\n", current_trace->name);
- else
- r = sprintf(buf, "\n");
+ r = sprintf(buf, "%s\n", current_trace->name);
mutex_unlock(&trace_types_lock);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3034,6 +3055,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
tr->data[cpu]->entries = val;
}
+/* resize @tr's buffer to the size of @size_tr's entries */
+static int resize_buffer_duplicate_size(struct trace_array *tr,
+ struct trace_array *size_tr, int cpu_id)
+{
+ int cpu, ret = 0;
+
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(tr->buffer,
+ size_tr->data[cpu]->entries, cpu);
+ if (ret < 0)
+ break;
+ tr->data[cpu]->entries = size_tr->data[cpu]->entries;
+ }
+ } else {
+ ret = ring_buffer_resize(tr->buffer,
+ size_tr->data[cpu_id]->entries, cpu_id);
+ if (ret == 0)
+ tr->data[cpu_id]->entries =
+ size_tr->data[cpu_id]->entries;
+ }
+
+ return ret;
+}
+
static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
{
int ret;
@@ -3058,23 +3104,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
ret = ring_buffer_resize(max_tr.buffer, size, cpu);
if (ret < 0) {
- int r = 0;
-
- if (cpu == RING_BUFFER_ALL_CPUS) {
- int i;
- for_each_tracing_cpu(i) {
- r = ring_buffer_resize(global_trace.buffer,
- global_trace.data[i]->entries,
- i);
- if (r < 0)
- break;
- }
- } else {
- r = ring_buffer_resize(global_trace.buffer,
- global_trace.data[cpu]->entries,
- cpu);
- }
-
+ int r = resize_buffer_duplicate_size(&global_trace,
+ &global_trace, cpu);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -3171,6 +3202,7 @@ static int tracing_set_tracer(const char *buf)
static struct trace_option_dentry *topts;
struct trace_array *tr = &global_trace;
struct tracer *t;
+ bool had_max_tr;
int ret = 0;
mutex_lock(&trace_types_lock);
@@ -3195,9 +3227,21 @@ static int tracing_set_tracer(const char *buf)
goto out;
trace_branch_disable();
- if (current_trace && current_trace->reset)
+ if (current_trace->reset)
current_trace->reset(tr);
- if (current_trace && current_trace->use_max_tr) {
+
+ had_max_tr = current_trace->allocated_snapshot;
+ current_trace = &nop_trace;
+
+ if (had_max_tr && !t->use_max_tr) {
+ /*
+ * We need to make sure that the update_max_tr sees that
+ * current_trace changed to nop_trace to keep it from
+ * swapping the buffers after we resize it.
+ * The update_max_tr is called from interrupts disabled
+ * so a synchronized_sched() is sufficient.
+ */
+ synchronize_sched();
/*
* We don't free the ring buffer. instead, resize it because
* The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3205,24 +3249,19 @@ static int tracing_set_tracer(const char *buf)
*/
ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&max_tr, 1);
+ tracing_reset_online_cpus(&max_tr);
+ current_trace->allocated_snapshot = false;
}
destroy_trace_option_files(topts);
- current_trace = &nop_trace;
-
topts = create_trace_option_files(t);
- if (t->use_max_tr) {
- int cpu;
+ if (t->use_max_tr && !had_max_tr) {
/* we need to make per cpu buffer sizes equivalent */
- for_each_tracing_cpu(cpu) {
- ret = ring_buffer_resize(max_tr.buffer,
- global_trace.data[cpu]->entries,
- cpu);
- if (ret < 0)
- goto out;
- max_tr.data[cpu]->entries =
- global_trace.data[cpu]->entries;
- }
+ ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
+ RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ goto out;
+ t->allocated_snapshot = true;
}
if (t->init) {
@@ -3330,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
ret = -ENOMEM;
goto fail;
}
- if (current_trace)
- *iter->trace = *current_trace;
+ *iter->trace = *current_trace;
if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
ret = -ENOMEM;
@@ -3448,7 +3486,7 @@ static int tracing_wait_pipe(struct file *filp)
return -EINTR;
/*
- * We block until we read something and tracing is enabled.
+ * We block until we read something and tracing is disabled.
* We still block if tracing is disabled, but we have never
* read anything. This allows a user to cat this file, and
* then enable tracing. But after we have read something,
@@ -3456,7 +3494,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (tracing_is_enabled() && iter->pos)
+ if (!tracing_is_enabled() && iter->pos)
break;
}
@@ -3471,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
- static struct tracer *old_tracer;
ssize_t sret;
/* return any leftover data */
@@ -3483,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
/*
@@ -3642,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
- static struct tracer *old_tracer;
ssize_t ret;
size_t rem;
unsigned int i;
@@ -3652,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(old_tracer != current_trace && current_trace)) {
- old_tracer = current_trace;
+ if (unlikely(iter->trace->name != current_trace->name))
*iter->trace = *current_trace;
- }
mutex_unlock(&trace_types_lock);
mutex_lock(&iter->mutex);
@@ -4031,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
tracing_reset_online_cpus(&global_trace);
- if (max_tr.buffer)
- tracing_reset_online_cpus(&max_tr);
+ tracing_reset_online_cpus(&max_tr);
mutex_unlock(&trace_types_lock);
@@ -4048,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
return single_open(file, tracing_clock_show, NULL);
}
+#ifdef CONFIG_TRACER_SNAPSHOT
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+ struct trace_iterator *iter;
+ int ret = 0;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, true);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ }
+ return ret;
+}
+
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ mutex_lock(&trace_types_lock);
+
+ if (current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ switch (val) {
+ case 0:
+ if (current_trace->allocated_snapshot) {
+ /* free spare buffer */
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&max_tr, 1);
+ tracing_reset_online_cpus(&max_tr);
+ current_trace->allocated_snapshot = false;
+ }
+ break;
+ case 1:
+ if (!current_trace->allocated_snapshot) {
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&max_tr,
+ &global_trace, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ break;
+ current_trace->allocated_snapshot = true;
+ }
+
+ local_irq_disable();
+ /* Now, we're going to swap */
+ update_max_tr(&global_trace, current, smp_processor_id());
+ local_irq_enable();
+ break;
+ default:
+ if (current_trace->allocated_snapshot)
+ tracing_reset_online_cpus(&max_tr);
+ else
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret >= 0) {
+ *ppos += cnt;
+ ret = cnt;
+ }
+out:
+ mutex_unlock(&trace_types_lock);
+ return ret;
+}
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -4104,6 +4216,16 @@ static const struct file_operations trace_clock_fops = {
.write = tracing_clock_write,
};
+#ifdef CONFIG_TRACER_SNAPSHOT
+static const struct file_operations snapshot_fops = {
+ .open = tracing_snapshot_open,
+ .read = seq_read,
+ .write = tracing_snapshot_write,
+ .llseek = tracing_seek,
+ .release = tracing_release,
+};
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
struct ftrace_buffer_info {
struct trace_array *tr;
void *spare;
@@ -4271,13 +4393,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -ENOMEM;
if (*ppos & (PAGE_SIZE - 1)) {
- WARN_ONCE(1, "Ftrace: previous read must page-align\n");
ret = -EINVAL;
goto out;
}
if (len & (PAGE_SIZE - 1)) {
- WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
if (len < PAGE_SIZE) {
ret = -EINVAL;
goto out;
@@ -4410,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
trace_seq_printf(s, "dropped events: %ld\n", cnt);
+ cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "read events: %ld\n", cnt);
+
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
kfree(s);
@@ -4486,7 +4609,7 @@ struct dentry *tracing_init_dentry(void)
static struct dentry *d_percpu;
-struct dentry *tracing_dentry_percpu(void)
+static struct dentry *tracing_dentry_percpu(void)
{
static int once;
struct dentry *d_tracer;
@@ -4813,10 +4936,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
return ret;
if (buffer) {
- if (val)
+ mutex_lock(&trace_types_lock);
+ if (val) {
ring_buffer_record_on(buffer);
- else
+ if (current_trace->start)
+ current_trace->start(tr);
+ } else {
ring_buffer_record_off(buffer);
+ if (current_trace->stop)
+ current_trace->stop(tr);
+ }
+ mutex_unlock(&trace_types_lock);
}
(*ppos)++;
@@ -4895,6 +5025,11 @@ static __init int tracer_init_debugfs(void)
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_tracer,
+ (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
+#endif
+
create_trace_options_dir();
for_each_tracing_cpu(cpu)
@@ -5003,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
if (disable_tracing)
ftrace_kill();
+ /* Simulate the iterator */
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
@@ -5014,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
/* don't look at user memory in panic mode */
trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- /* Simulate the iterator */
- iter.tr = &global_trace;
- iter.trace = current_trace;
-
switch (oops_dump_mode) {
case DUMP_ALL:
iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5162,7 +5294,7 @@ __init static int tracer_alloc_buffers(void)
init_irq_work(&trace_work_wakeup, trace_wake_up);
register_tracer(&nop_trace);
- current_trace = &nop_trace;
+
/* All seems OK, enable tracing */
tracing_disabled = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902c..57d7e5397d56 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@ struct tracer {
struct tracer_flags *flags;
bool print_max;
bool use_max_tr;
+ bool allocated_snapshot;
};
/* Only current can touch trace_recursion */
-#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
-#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
-/* Ring buffer has the 10 LSB bits to count */
-#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
-
-/* for function tracing recursion */
-#define TRACE_INTERNAL_BIT (1<<11)
-#define TRACE_GLOBAL_BIT (1<<12)
-#define TRACE_CONTROL_BIT (1<<13)
+/*
+ * For function tracing recursion:
+ * The order of these bits are important.
+ *
+ * When function tracing occurs, the following steps are made:
+ * If arch does not support a ftrace feature:
+ * call internal function (uses INTERNAL bits) which calls...
+ * If callback is registered to the "global" list, the list
+ * function is called and recursion checks the GLOBAL bits.
+ * then this function calls...
+ * The function callback, which can use the FTRACE bits to
+ * check for recursion.
+ *
+ * Now if the arch does not suppport a feature, and it calls
+ * the global list function which calls the ftrace callback
+ * all three of these steps will do a recursion protection.
+ * There's no reason to do one if the previous caller already
+ * did. The recursion that we are protecting against will
+ * go through the same steps again.
+ *
+ * To prevent the multiple recursion checks, if a recursion
+ * bit is set that is higher than the MAX bit of the current
+ * check, then we know that the check was made by the previous
+ * caller, and we can skip the current check.
+ */
+enum {
+ TRACE_BUFFER_BIT,
+ TRACE_BUFFER_NMI_BIT,
+ TRACE_BUFFER_IRQ_BIT,
+ TRACE_BUFFER_SIRQ_BIT,
+
+ /* Start of function recursion bits */
+ TRACE_FTRACE_BIT,
+ TRACE_FTRACE_NMI_BIT,
+ TRACE_FTRACE_IRQ_BIT,
+ TRACE_FTRACE_SIRQ_BIT,
+
+ /* GLOBAL_BITs must be greater than FTRACE_BITs */
+ TRACE_GLOBAL_BIT,
+ TRACE_GLOBAL_NMI_BIT,
+ TRACE_GLOBAL_IRQ_BIT,
+ TRACE_GLOBAL_SIRQ_BIT,
+
+ /* INTERNAL_BITs must be greater than GLOBAL_BITs */
+ TRACE_INTERNAL_BIT,
+ TRACE_INTERNAL_NMI_BIT,
+ TRACE_INTERNAL_IRQ_BIT,
+ TRACE_INTERNAL_SIRQ_BIT,
+
+ TRACE_CONTROL_BIT,
/*
* Abuse of the trace_recursion.
@@ -309,11 +351,77 @@ struct tracer {
* was called in irq context but we have irq tracing off. Since this
* can only be modified by current, we can reuse trace_recursion.
*/
-#define TRACE_IRQ_BIT (1<<13)
+ TRACE_IRQ_BIT,
+};
+
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
+
+#define TRACE_CONTEXT_BITS 4
+
+#define TRACE_FTRACE_START TRACE_FTRACE_BIT
+#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
+#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_LIST_START TRACE_INTERNAL_BIT
+#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
+
+static __always_inline int trace_get_context_bit(void)
+{
+ int bit;
-#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
-#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
-#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
+
+ return bit;
+}
+
+static __always_inline int trace_test_and_set_recursion(int start, int max)
+{
+ unsigned int val = current->trace_recursion;
+ int bit;
+
+ /* A previous recursion check was made */
+ if ((val & TRACE_CONTEXT_MASK) > max)
+ return 0;
+
+ bit = trace_get_context_bit() + start;
+ if (unlikely(val & (1 << bit)))
+ return -1;
+
+ val |= 1 << bit;
+ current->trace_recursion = val;
+ barrier();
+
+ return bit;
+}
+
+static __always_inline void trace_clear_recursion(int bit)
+{
+ unsigned int val = current->trace_recursion;
+
+ if (!bit)
+ return;
+
+ bit = 1 << bit;
+ val &= ~bit;
+
+ barrier();
+ current->trace_recursion = val;
+}
#define TRACE_PIPE_ALL_CPU -1
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
#include <linux/ktime.h>
#include <linux/trace_clock.h>
-#include "trace.h"
-
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
return clock;
}
+EXPORT_SYMBOL_GPL(trace_clock_local);
/*
* trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = cpu_clock(this_cpu);
+ now = sched_clock_cpu(this_cpu);
/*
* If in an NMI context then dont risk lockups and return the
* cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b946..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
- __common_field(int, padding);
return ret;
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab7..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(tr);
}
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *pt_regs)
-{
- struct trace_array *tr = func_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- if (unlikely(!ftrace_function_enabled))
- return;
-
- pc = preempt_count();
- preempt_disable_notrace();
- local_save_flags(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1))
- trace_function(tr, ip, parent_ip, flags, pc);
-
- atomic_dec(&data->disabled);
- preempt_enable_notrace();
-}
-
/* Our option */
enum {
TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
-
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
+ int bit;
int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
- /*
- * Need to use raw, since this must be called before the
- * recursive protection is performed.
- */
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ pc = preempt_count();
+ preempt_disable_notrace();
- if (likely(disabled == 1)) {
- pc = preempt_count();
+ bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ if (bit < 0)
+ goto out;
+
+ cpu = smp_processor_id();
+ data = tr->data[cpu];
+ if (!atomic_read(&data->disabled)) {
+ local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
}
+ trace_clear_recursion(bit);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ out:
+ preempt_enable_notrace();
}
static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
{
ftrace_function_enabled = 0;
- if (trace_flags & TRACE_ITER_PREEMPTONLY)
- trace_ops.func = function_trace_call_preempt_only;
- else
- trace_ops.func = function_trace_call;
-
if (func_flags.val & TRACE_FUNC_OPT_STACK)
register_ftrace_function(&trace_stack_ops);
else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
#define TRACE_GRAPH_PRINT_IRQS 0x40
+static unsigned int max_depth;
+
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
barrier();
current->curr_ret_stack--;
+ /*
+ * The trace should run after decrementing the ret counter
+ * in case an interrupt were to come in. We don't want to
+ * lose the interrupt if max_depth is set.
+ */
+ ftrace_graph_return(&trace);
+
if (unlikely(!ret)) {
ftrace_graph_stop();
WARN_ON(1);
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return 0;
/* trace it when it is-nested-in or is a function enabled. */
- if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
- ftrace_graph_ignore_irqs())
+ if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
+ ftrace_graph_ignore_irqs()) ||
+ (max_depth && trace->depth >= max_depth))
return 0;
local_irq_save(flags);
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
#endif
};
+
+static ssize_t
+graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ max_depth = val;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
+ int n;
+
+ n = sprintf(buf, "%d\n", max_depth);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static const struct file_operations graph_depth_fops = {
+ .open = tracing_open_generic,
+ .write = graph_depth_write,
+ .read = graph_depth_read,
+ .llseek = generic_file_llseek,
+};
+
+static __init int init_graph_debugfs(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ trace_create_file("max_graph_depth", 0644, d_tracer,
+ NULL, &graph_depth_fops);
+
+ return 0;
+}
+fs_initcall(init_graph_debugfs);
+
static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc7..697e88d13907 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -739,12 +739,11 @@ static int task_state_char(unsigned long state)
struct trace_event *ftrace_find_event(int type)
{
struct trace_event *event;
- struct hlist_node *n;
unsigned key;
key = type & (EVENT_HASHSIZE - 1);
- hlist_for_each_entry(event, n, &event_hash[key], node) {
+ hlist_for_each_entry(event, &event_hash[key], node) {
if (event->type == type)
return event;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
#define TP_FLAG_TRACE 1
#define TP_FLAG_PROFILE 2
#define TP_FLAG_REGISTERED 4
-#define TP_FLAG_UPROBE 8
/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca0..75aa97fbe1a1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,8 +15,8 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/sched/rt.h>
#include <trace/events/sched.h>
-
#include "trace.h"
static struct trace_array *wakeup_trace;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a815..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
* The ftrace infrastructure should provide the recursion
* protection. If not, this will crash the kernel!
*/
- trace_selftest_recursion_cnt++;
+ if (trace_selftest_recursion_cnt++ > 10)
+ return;
DYN_FTRACE_TEST_NAME();
}
@@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)
char *func_name;
int len;
int ret;
- int cnt;
/* The previous test PASSED */
pr_cont("PASSED\n");
@@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)
unregister_ftrace_function(&test_recsafe_probe);
- /*
- * If arch supports all ftrace features, and no other task
- * was on the list, we should be fine.
- */
- if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
- cnt = 2; /* Should have recursed */
- else
- cnt = 1;
-
ret = -1;
- if (trace_selftest_recursion_cnt != cnt) {
- pr_cont("*callback not called expected %d times (%d)* ",
- cnt, trace_selftest_recursion_cnt);
+ if (trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called expected 2 times (%d)* ",
+ trace_selftest_recursion_cnt);
goto out;
}
@@ -568,7 +559,7 @@ trace_selftest_function_regs(void)
int ret;
int supported = 0;
-#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
supported = 1;
#endif
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0c1b165778e5..42ca822fc701 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -33,7 +33,6 @@ static unsigned long max_stack_size;
static arch_spinlock_t max_stack_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-static int stack_trace_disabled __read_mostly;
static DEFINE_PER_CPU(int, trace_active);
static DEFINE_MUTEX(stack_sysctl_mutex);
@@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
{
int cpu;
- if (unlikely(!ftrace_enabled || stack_trace_disabled))
- return;
-
preempt_disable_notrace();
cpu = raw_smp_processor_id();
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c2..7a809e321058 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
+#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
@@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
}
#endif
+#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
+/*
+ * Some architectures that allow for 32bit applications
+ * to run on a 64bit kernel, do not map the syscalls for
+ * the 32bit tasks the same as they do for 64bit tasks.
+ *
+ * *cough*x86*cough*
+ *
+ * In such a case, instead of reporting the wrong syscalls,
+ * simply ignore them.
+ *
+ * For an arch to ignore the compat syscalls it needs to
+ * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
+ * define the function arch_trace_is_compat_syscall() to let
+ * the tracing system know that it should ignore it.
+ */
+static int
+trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
+{
+ if (unlikely(arch_trace_is_compat_syscall(regs)))
+ return -1;
+
+ return syscall_get_nr(task, regs);
+}
+#else
+static inline int
+trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
+{
+ return syscall_get_nr(task, regs);
+}
+#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
+
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
@@ -77,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
return syscalls_metadata[nr];
}
-enum print_line_t
+static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -130,7 +163,7 @@ end:
return TRACE_TYPE_HANDLED;
}
-enum print_line_t
+static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -270,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
-void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
- int size;
int syscall_nr;
+ int size;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
if (!test_bit(syscall_nr, enabled_enter_syscalls))
@@ -305,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
@@ -313,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct ring_buffer *buffer;
int syscall_nr;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
if (!test_bit(syscall_nr, enabled_exit_syscalls))
@@ -337,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-int reg_event_syscall_enter(struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -356,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
return ret;
}
-void unreg_event_syscall_enter(struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
int num;
@@ -371,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
-int reg_event_syscall_exit(struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -390,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
return ret;
}
-void unreg_event_syscall_exit(struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
int num;
@@ -459,7 +492,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
return (unsigned long)sys_call_table[nr];
}
-int __init init_ftrace_syscalls(void)
+static int __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
@@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int rctx;
int size;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
@@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int rctx;
int size;
- syscall_nr = syscall_get_nr(current, regs);
+ syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9614db8b0f8c..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -22,25 +22,27 @@
#include <linux/uaccess.h>
#include <linux/uprobes.h>
#include <linux/namei.h>
+#include <linux/string.h>
#include "trace_probe.h"
#define UPROBE_EVENT_SYSTEM "uprobes"
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
/*
* uprobe event core functions
*/
-struct trace_uprobe;
-struct uprobe_trace_consumer {
- struct uprobe_consumer cons;
- struct trace_uprobe *tu;
-};
-
struct trace_uprobe {
struct list_head list;
struct ftrace_event_class class;
struct ftrace_event_call call;
- struct uprobe_trace_consumer *consumer;
+ struct trace_uprobe_filter filter;
+ struct uprobe_consumer consumer;
struct inode *inode;
char *filename;
unsigned long offset;
@@ -63,6 +65,18 @@ static LIST_HEAD(uprobe_list);
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
+{
+ rwlock_init(&filter->rwlock);
+ filter->nr_systemwide = 0;
+ INIT_LIST_HEAD(&filter->perf_events);
+}
+
+static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
+{
+ return !filter->nr_systemwide && list_empty(&filter->perf_events);
+}
+
/*
* Allocate new trace_uprobe and initialize it (including uprobes).
*/
@@ -91,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
goto error;
INIT_LIST_HEAD(&tu->list);
+ tu->consumer.handler = uprobe_dispatcher;
+ init_trace_uprobe_filter(&tu->filter);
return tu;
error:
@@ -252,27 +268,32 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret)
goto fail_address_parse;
+ inode = igrab(path.dentry->d_inode);
+ path_put(&path);
+
+ if (!inode || !S_ISREG(inode->i_mode)) {
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
- inode = igrab(path.dentry->d_inode);
-
argc -= 2;
argv += 2;
/* setup a probe */
if (!event) {
- char *tail = strrchr(filename, '/');
+ char *tail;
char *ptr;
- ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL);
- if (!ptr) {
+ tail = kstrdup(kbasename(filename), GFP_KERNEL);
+ if (!tail) {
ret = -ENOMEM;
goto fail_address_parse;
}
- tail = ptr;
ptr = strpbrk(tail, ".-_");
if (ptr)
*ptr = '\0';
@@ -356,7 +377,7 @@ fail_address_parse:
if (inode)
iput(inode);
- pr_info("Failed to parse address.\n");
+ pr_info("Failed to parse address or file.\n");
return ret;
}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
};
/* uprobe handler */
-static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
unsigned long irq_flags;
struct ftrace_event_call *call = &tu->call;
- tu->nhit++;
-
local_save_flags(irq_flags);
pc = preempt_count();
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
size, irq_flags, pc);
if (!event)
- return;
+ return 0;
entry = ring_buffer_event_data(event);
- entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+ return 0;
}
/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
return TRACE_TYPE_PARTIAL_LINE;
}
-static int probe_event_enable(struct trace_uprobe *tu, int flag)
+static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
{
- struct uprobe_trace_consumer *utc;
- int ret = 0;
+ return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
+}
- if (!tu->inode || tu->consumer)
- return -EINTR;
+typedef bool (*filter_func_t)(struct uprobe_consumer *self,
+ enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm);
- utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
- if (!utc)
+static int
+probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
+{
+ int ret = 0;
+
+ if (is_trace_uprobe_enabled(tu))
return -EINTR;
- utc->cons.handler = uprobe_dispatcher;
- utc->cons.filter = NULL;
- ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
- if (ret) {
- kfree(utc);
- return ret;
- }
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
tu->flags |= flag;
- utc->tu = tu;
- tu->consumer = utc;
+ tu->consumer.filter = filter;
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ if (ret)
+ tu->flags &= ~flag;
- return 0;
+ return ret;
}
static void probe_event_disable(struct trace_uprobe *tu, int flag)
{
- if (!tu->inode || !tu->consumer)
+ if (!is_trace_uprobe_enabled(tu))
return;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->flags &= ~flag;
- kfree(tu->consumer);
- tu->consumer = NULL;
}
static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
}
#ifdef CONFIG_PERF_EVENTS
+static bool
+__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
+{
+ struct perf_event *event;
+
+ if (filter->nr_systemwide)
+ return true;
+
+ list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
+ if (event->hw.tp_target->mm == mm)
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool
+uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+{
+ return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+}
+
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ /*
+ * event->parent != NULL means copy_process(), we can avoid
+ * uprobe_apply(). current->mm must be probed and we can rely
+ * on dup_mmap() which preserves the already installed bp's.
+ *
+ * attr.enable_on_exec means that exec/mmap will install the
+ * breakpoints we need.
+ */
+ done = tu->filter.nr_systemwide ||
+ event->parent || event->attr.enable_on_exec ||
+ uprobe_filter_event(tu, event);
+ list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ } else {
+ done = tu->filter.nr_systemwide;
+ tu->filter.nr_systemwide++;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+
+ return 0;
+}
+
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.tp_target) {
+ list_del(&event->hw.tp_list);
+ done = tu->filter.nr_systemwide ||
+ (event->hw.tp_target->flags & PF_EXITING) ||
+ uprobe_filter_event(tu, event);
+ } else {
+ tu->filter.nr_systemwide--;
+ done = tu->filter.nr_systemwide;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+
+ return 0;
+}
+
+static bool uprobe_perf_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct trace_uprobe *tu;
+ int ret;
+
+ tu = container_of(uc, struct trace_uprobe, consumer);
+ read_lock(&tu->filter.rwlock);
+ ret = __uprobe_perf_filter(&tu->filter, mm);
+ read_unlock(&tu->filter.rwlock);
+
+ return ret;
+}
+
/* uprobe profile handler */
-static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tu->call;
struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
int size, __size, i;
int rctx;
+ if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ return UPROBE_HANDLER_REMOVE;
+
__size = sizeof(*entry) + tu->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
- return;
+ return 0;
preempt_disable();
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
if (!entry)
goto out;
- entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
out:
preempt_enable();
+ return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
switch (type) {
case TRACE_REG_REGISTER:
- return probe_event_enable(tu, TP_FLAG_TRACE);
+ return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
case TRACE_REG_UNREGISTER:
probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return probe_event_enable(tu, TP_FLAG_PROFILE);
+ return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER:
probe_event_disable(tu, TP_FLAG_PROFILE);
return 0;
+
+ case TRACE_REG_PERF_OPEN:
+ return uprobe_perf_open(tu, data);
+
+ case TRACE_REG_PERF_CLOSE:
+ return uprobe_perf_close(tu, data);
+
#endif
default:
return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
- struct uprobe_trace_consumer *utc;
struct trace_uprobe *tu;
+ int ret = 0;
- utc = container_of(con, struct uprobe_trace_consumer, cons);
- tu = utc->tu;
- if (!tu || tu->consumer != utc)
- return 0;
+ tu = container_of(con, struct trace_uprobe, consumer);
+ tu->nhit++;
if (tu->flags & TP_FLAG_TRACE)
- uprobe_trace_func(tu, regs);
+ ret |= uprobe_trace_func(tu, regs);
#ifdef CONFIG_PERF_EVENTS
if (tu->flags & TP_FLAG_PROFILE)
- uprobe_perf_func(tu, regs);
+ ret |= uprobe_perf_func(tu, regs);
#endif
- return 0;
+ return ret;
}
static struct trace_event_functions uprobe_funcs = {
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabfa..0c05a4592047 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
static struct tracepoint_entry *get_tracepoint(const char *name)
{
struct hlist_head *head;
- struct hlist_node *node;
struct tracepoint_entry *e;
u32 hash = jhash(name, strlen(name), 0);
head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
+ hlist_for_each_entry(e, head, hlist) {
if (!strcmp(name, e->name))
return e;
}
@@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name)
static struct tracepoint_entry *add_tracepoint(const char *name)
{
struct hlist_head *head;
- struct hlist_node *node;
struct tracepoint_entry *e;
size_t name_len = strlen(name) + 1;
u32 hash = jhash(name, name_len-1, 0);
head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
+ hlist_for_each_entry(e, head, hlist) {
if (!strcmp(name, e->name)) {
printk(KERN_NOTICE
"tracepoint %s busy\n", name);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b44690..a1dd9a1b1327 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
{
const struct cred *tcred;
struct timespec uptime, ts;
+ cputime_t utime, stime, utimescaled, stimescaled;
u64 ac_etime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_ppid = pid_alive(tsk) ?
task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
rcu_read_unlock();
- stats->ac_utime = cputime_to_usecs(tsk->utime);
- stats->ac_stime = cputime_to_usecs(tsk->stime);
- stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
- stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
+
+ task_cputime(tsk, &utime, &stime);
+ stats->ac_utime = cputime_to_usecs(utime);
+ stats->ac_stime = cputime_to_usecs(stime);
+
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ stats->ac_utimescaled = cputime_to_usecs(utimescaled);
+ stats->ac_stimescaled = cputime_to_usecs(stimescaled);
+
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
@@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
#undef KB
#undef MB
-/**
- * acct_update_integrals - update mm integral fields in task_struct
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
+static void __acct_update_integrals(struct task_struct *tsk,
+ cputime_t utime, cputime_t stime)
{
if (likely(tsk->mm)) {
cputime_t time, dtime;
@@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
u64 delta;
local_irq_save(flags);
- time = tsk->stime + tsk->utime;
+ time = stime + utime;
dtime = time - tsk->acct_timexpd;
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
delta = value.tv_sec;
@@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
}
/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
+ __acct_update_integrals(tsk, utime, stime);
+}
+
+/**
+ * acct_account_cputime - update mm integral after cputime update
+ * @tsk: task_struct for accounting
+ */
+void acct_account_cputime(struct task_struct *tsk)
+{
+ __acct_update_integrals(tsk, tsk->utime, tsk->stime);
+}
+
+/**
* acct_clear_integrals - clear the mm integral fields in task_struct
* @tsk: task_struct whose accounting fields are cleared
*/
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1fb..394f70b17162 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
void fire_user_return_notifiers(void)
{
struct user_return_notifier *urn;
- struct hlist_node *tmp1, *tmp2;
+ struct hlist_node *tmp2;
struct hlist_head *head;
head = &get_cpu_var(return_notifier_list);
- hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+ hlist_for_each_entry_safe(urn, tmp2, head, link)
urn->on_user_return(urn);
put_cpu_var(return_notifier_list);
}
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9ec..e81978e8c03b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
/*
* userns count is 1 for root user, 1 for init_uts_ns,
@@ -46,11 +47,10 @@ struct user_namespace init_user_ns = {
.count = 4294967295U,
},
},
- .kref = {
- .refcount = ATOMIC_INIT(3),
- },
+ .count = ATOMIC_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
+ .proc_inum = PROC_USER_INIT_INO,
};
EXPORT_SYMBOL_GPL(init_user_ns);
@@ -105,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up)
static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
{
struct user_struct *user;
- struct hlist_node *h;
- hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ hlist_for_each_entry(user, hashent, uidhash_node) {
if (uid_eq(user->uid, uid)) {
atomic_inc(&user->__count);
return user;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba34..8b650837083e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
@@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
+static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
+{
+ /* Start with the same capabilities as init but useless for doing
+ * anything as the capabilities are bound to the new user namespace.
+ */
+ cred->securebits = SECUREBITS_DEFAULT;
+ cred->cap_inheritable = CAP_EMPTY_SET;
+ cred->cap_permitted = CAP_FULL_SET;
+ cred->cap_effective = CAP_FULL_SET;
+ cred->cap_bset = CAP_FULL_SET;
+#ifdef CONFIG_KEYS
+ key_put(cred->request_key_auth);
+ cred->request_key_auth = NULL;
+#endif
+ /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
+ cred->user_ns = user_ns;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -39,6 +58,7 @@ int create_user_ns(struct cred *new)
struct user_namespace *ns, *parent_ns = new->user_ns;
kuid_t owner = new->euid;
kgid_t group = new->egid;
+ int ret;
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
@@ -52,40 +72,48 @@ int create_user_ns(struct cred *new)
if (!ns)
return -ENOMEM;
- kref_init(&ns->kref);
+ ret = proc_alloc_inum(&ns->proc_inum);
+ if (ret) {
+ kmem_cache_free(user_ns_cachep, ns);
+ return ret;
+ }
+
+ atomic_set(&ns->count, 1);
+ /* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->owner = owner;
ns->group = group;
- /* Start with the same capabilities as init but useless for doing
- * anything as the capabilities are bound to the new user namespace.
- */
- new->securebits = SECUREBITS_DEFAULT;
- new->cap_inheritable = CAP_EMPTY_SET;
- new->cap_permitted = CAP_FULL_SET;
- new->cap_effective = CAP_FULL_SET;
- new->cap_bset = CAP_FULL_SET;
-#ifdef CONFIG_KEYS
- key_put(new->request_key_auth);
- new->request_key_auth = NULL;
-#endif
- /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
-
- /* Leave the new->user_ns reference with the new user namespace. */
- /* Leave the reference to our user_ns with the new cred. */
- new->user_ns = ns;
+ set_cred_user_ns(new, ns);
return 0;
}
-void free_user_ns(struct kref *kref)
+int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
- struct user_namespace *parent, *ns =
- container_of(kref, struct user_namespace, kref);
+ struct cred *cred;
- parent = ns->parent;
- kmem_cache_free(user_ns_cachep, ns);
- put_user_ns(parent);
+ if (!(unshare_flags & CLONE_NEWUSER))
+ return 0;
+
+ cred = prepare_creds();
+ if (!cred)
+ return -ENOMEM;
+
+ *new_cred = cred;
+ return create_user_ns(cred);
+}
+
+void free_user_ns(struct user_namespace *ns)
+{
+ struct user_namespace *parent;
+
+ do {
+ parent = ns->parent;
+ proc_free_inum(ns->proc_inum);
+ kmem_cache_free(user_ns_cachep, ns);
+ ns = parent;
+ } while (atomic_dec_and_test(&parent->count));
}
EXPORT_SYMBOL(free_user_ns);
@@ -372,7 +400,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
struct user_namespace *lower_ns;
uid_t lower;
- lower_ns = current_user_ns();
+ lower_ns = seq_user_ns(seq);
if ((lower_ns == ns) && lower_ns->parent)
lower_ns = lower_ns->parent;
@@ -393,7 +421,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
struct user_namespace *lower_ns;
gid_t lower;
- lower_ns = current_user_ns();
+ lower_ns = seq_user_ns(seq);
if ((lower_ns == ns) && lower_ns->parent)
lower_ns = lower_ns->parent;
@@ -492,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = {
.show = projid_m_show,
};
+static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
+{
+ u32 upper_first, lower_first, upper_last, lower_last;
+ unsigned idx;
+
+ upper_first = extent->first;
+ lower_first = extent->lower_first;
+ upper_last = upper_first + extent->count - 1;
+ lower_last = lower_first + extent->count - 1;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ u32 prev_upper_first, prev_lower_first;
+ u32 prev_upper_last, prev_lower_last;
+ struct uid_gid_extent *prev;
+
+ prev = &new_map->extent[idx];
+
+ prev_upper_first = prev->first;
+ prev_lower_first = prev->lower_first;
+ prev_upper_last = prev_upper_first + prev->count - 1;
+ prev_lower_last = prev_lower_first + prev->count - 1;
+
+ /* Does the upper range intersect a previous extent? */
+ if ((prev_upper_first <= upper_last) &&
+ (prev_upper_last >= upper_first))
+ return true;
+
+ /* Does the lower range intersect a previous extent? */
+ if ((prev_lower_first <= lower_last) &&
+ (prev_lower_last >= lower_first))
+ return true;
+ }
+ return false;
+}
+
+
static DEFINE_MUTEX(id_map_mutex);
static ssize_t map_write(struct file *file, const char __user *buf,
@@ -504,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct user_namespace *ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
- struct uid_gid_extent *extent, *last = NULL;
+ struct uid_gid_extent *extent = NULL;
unsigned long page = 0;
char *kbuf, *pos, *next_line;
ssize_t ret = -EINVAL;
@@ -607,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if ((extent->lower_first + extent->count) <= extent->lower_first)
goto out;
- /* For now only accept extents that are strictly in order */
- if (last &&
- (((last->first + last->count) > extent->first) ||
- ((last->lower_first + last->count) > extent->lower_first)))
+ /* Do the ranges in extent overlap any previous extents? */
+ if (mappings_overlap(&new_map, extent))
goto out;
new_map.nr_extents++;
- last = extent;
/* Fail if the file contains too many extents */
if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
@@ -669,10 +730,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
if (!ns->parent)
return -EPERM;
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
return map_write(file, buf, size, ppos, CAP_SETUID,
&ns->uid_map, &ns->parent->uid_map);
}
@@ -681,10 +746,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
if (!ns->parent)
return -EPERM;
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
return map_write(file, buf, size, ppos, CAP_SETGID,
&ns->gid_map, &ns->parent->gid_map);
}
@@ -709,6 +778,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
+ /* Allow mapping to your own filesystem ids */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ u32 id = new_map->extent[0].lower_first;
+ if (cap_setid == CAP_SETUID) {
+ kuid_t uid = make_kuid(ns->parent, id);
+ if (uid_eq(uid, current_fsuid()))
+ return true;
+ }
+ else if (cap_setid == CAP_SETGID) {
+ kgid_t gid = make_kgid(ns->parent, id);
+ if (gid_eq(gid, current_fsgid()))
+ return true;
+ }
+ }
+
/* Allow anyone to set a mapping that doesn't require privilege */
if (!cap_valid(cap_setid))
return true;
@@ -722,6 +806,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
return false;
}
+static void *userns_get(struct task_struct *task)
+{
+ struct user_namespace *user_ns;
+
+ rcu_read_lock();
+ user_ns = get_user_ns(__task_cred(task)->user_ns);
+ rcu_read_unlock();
+
+ return user_ns;
+}
+
+static void userns_put(void *ns)
+{
+ put_user_ns(ns);
+}
+
+static int userns_install(struct nsproxy *nsproxy, void *ns)
+{
+ struct user_namespace *user_ns = ns;
+ struct cred *cred;
+
+ /* Don't allow gaining capabilities by reentering
+ * the same user namespace.
+ */
+ if (user_ns == current_user_ns())
+ return -EINVAL;
+
+ /* Threaded processes may not enter a different user namespace */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cred = prepare_creds();
+ if (!cred)
+ return -ENOMEM;
+
+ put_user_ns(cred->user_ns);
+ set_cred_user_ns(cred, get_user_ns(user_ns));
+
+ return commit_creds(cred);
+}
+
+static unsigned int userns_inum(void *ns)
+{
+ struct user_namespace *user_ns = ns;
+ return user_ns->proc_inum;
+}
+
+const struct proc_ns_operations userns_operations = {
+ .name = "user",
+ .type = CLONE_NEWUSER,
+ .get = userns_get,
+ .put = userns_put,
+ .install = userns_install,
+ .inum = userns_inum,
+};
+
static __init int user_namespaces_init(void)
{
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3fd..a47fc5de3113 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,20 +30,27 @@ static struct uts_namespace *create_uts_ns(void)
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
- * Return NULL on error (failure to kmalloc), new ns otherwise
+ * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
*/
-static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
+ int err;
ns = create_uts_ns();
if (!ns)
return ERR_PTR(-ENOMEM);
+ err = proc_alloc_inum(&ns->proc_inum);
+ if (err) {
+ kfree(ns);
+ return ERR_PTR(err);
+ }
+
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
- ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
+ ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
return ns;
}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
* versa.
*/
struct uts_namespace *copy_utsname(unsigned long flags,
- struct task_struct *tsk)
+ struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
- struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
if (!(flags & CLONE_NEWUTS))
return old_ns;
- new_ns = clone_uts_ns(tsk, old_ns);
+ new_ns = clone_uts_ns(user_ns, old_ns);
put_uts_ns(old_ns);
return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
ns = container_of(kref, struct uts_namespace, kref);
put_user_ns(ns->user_ns);
+ proc_free_inum(ns->proc_inum);
kfree(ns);
}
@@ -102,19 +109,32 @@ static void utsns_put(void *ns)
put_uts_ns(ns);
}
-static int utsns_install(struct nsproxy *nsproxy, void *ns)
+static int utsns_install(struct nsproxy *nsproxy, void *new)
{
+ struct uts_namespace *ns = new;
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !nsown_capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
get_uts_ns(ns);
put_uts_ns(nsproxy->uts_ns);
nsproxy->uts_ns = ns;
return 0;
}
+static unsigned int utsns_inum(void *vp)
+{
+ struct uts_namespace *ns = vp;
+
+ return ns->proc_inum;
+}
+
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
+ .inum = utsns_inum,
};
-
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d820..4f69f9a5e221 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
#include <linux/sysctl.h>
#include <linux/wait.h>
+#ifdef CONFIG_PROC_SYSCTL
+
static void *get_uts(ctl_table *table, int write)
{
char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
up_write(&uts_sem);
}
-#ifdef CONFIG_PROC_SYSCTL
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c8c21be11ab4..4a944676358e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
+#include <linux/sched/rt.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -31,6 +32,7 @@
int watchdog_enabled = 1;
int __read_mostly watchdog_thresh = 10;
static int __read_mostly watchdog_disabled;
+static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -111,12 +113,12 @@ static int get_softlockup_thresh(void)
* resolution, and we don't need to waste time with a big divide when
* 2^30ns == 1.074s.
*/
-static unsigned long get_timestamp(int this_cpu)
+static unsigned long get_timestamp(void)
{
- return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+ return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
}
-static u64 get_sample_period(void)
+static void set_sample_period(void)
{
/*
* convert watchdog_thresh from seconds to ns
@@ -125,15 +127,13 @@ static u64 get_sample_period(void)
* and hard thresholds) to increment before the
* hardlockup detector generates a warning
*/
- return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
}
/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
- int this_cpu = smp_processor_id();
-
- __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
+ __this_cpu_write(watchdog_touch_ts, get_timestamp());
}
void touch_softlockup_watchdog(void)
@@ -194,7 +194,7 @@ static int is_hardlockup(void)
static int is_softlockup(unsigned long touch_ts)
{
- unsigned long now = get_timestamp(smp_processor_id());
+ unsigned long now = get_timestamp();
/* Warn about unreasonable delays: */
if (time_after(now, touch_ts + get_softlockup_thresh()))
@@ -275,7 +275,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
wake_up_process(__this_cpu_read(softlockup_watchdog));
/* .. and repeat */
- hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
+ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
if (touch_ts == 0) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -343,6 +343,10 @@ static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ /* kick off the timer for the hardlockup detector */
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
+
if (!watchdog_enabled) {
kthread_park(current);
return;
@@ -351,12 +355,8 @@ static void watchdog_enable(unsigned int cpu)
/* Enable the perf event */
watchdog_nmi_enable(cpu);
- /* kick off the timer for the hardlockup detector */
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = watchdog_timer_fn;
-
/* done here because hrtimer_start can only pin to smp_processor_id() */
- hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
/* initialize timestamp */
@@ -368,9 +368,6 @@ static void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- if (!watchdog_enabled)
- return;
-
watchdog_set_prio(SCHED_NORMAL, 0);
hrtimer_cancel(hrtimer);
/* disable the perf event */
@@ -386,7 +383,7 @@ static int watchdog_should_run(unsigned int cpu)
/*
* The watchdog thread function - touches the timestamp.
*
- * It only runs once every get_sample_period() seconds (4 seconds by
+ * It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
* for more than 2*watchdog_thresh seconds then the debug-printout
* triggers in watchdog_timer_fn().
@@ -519,6 +516,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
if (ret || !write)
return ret;
+ set_sample_period();
if (watchdog_enabled && watchdog_thresh)
watchdog_enable_all_cpus();
else
@@ -540,6 +538,7 @@ static struct smp_hotplug_thread watchdog_threads = {
void __init lockup_detector_init(void)
{
+ set_sample_period();
if (smpboot_register_percpu_thread(&watchdog_threads)) {
pr_err("Failed to create watchdog threads, disabled\n");
watchdog_disabled = -ENODEV;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576a83c3..81f2457811eb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
+#include <linux/hashtable.h>
-#include "workqueue_sched.h"
+#include "workqueue_internal.h"
enum {
/*
- * global_cwq flags
+ * worker_pool flags
*
- * A bound gcwq is either associated or disassociated with its CPU.
+ * A bound pool is either associated or disassociated with its CPU.
* While associated (!DISASSOCIATED), all workers are bound to the
* CPU and none has %WORKER_UNBOUND set and concurrency management
* is in effect.
*
* While DISASSOCIATED, the cpu may be offline and all workers have
* %WORKER_UNBOUND set and concurrency management disabled, and may
- * be executing on any CPU. The gcwq behaves as an unbound one.
+ * be executing on any CPU. The pool behaves as an unbound one.
*
* Note that DISASSOCIATED can be flipped only while holding
- * assoc_mutex of all pools on the gcwq to avoid changing binding
- * state while create_worker() is in progress.
+ * assoc_mutex to avoid changing binding state while
+ * create_worker() is in progress.
*/
- GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
- GCWQ_FREEZING = 1 << 1, /* freeze in progress */
-
- /* pool flags */
POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
+ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
+ POOL_FREEZING = 1 << 3, /* freeze in progress */
/* worker flags */
WORKER_STARTED = 1 << 0, /* started */
@@ -79,11 +78,9 @@ enum {
WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
WORKER_CPU_INTENSIVE,
- NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
+ NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
- BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
- BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@ enum {
* P: Preemption protected. Disabling preemption is enough and should
* only be modified and accessed from the local cpu.
*
- * L: gcwq->lock protected. Access with gcwq->lock held.
+ * L: pool->lock protected. Access with pool->lock held.
*
- * X: During normal operation, modification requires gcwq->lock and
- * should be done only from local cpu. Either disabling preemption
- * on local cpu or grabbing gcwq->lock is enough for read access.
- * If GCWQ_DISASSOCIATED is set, it's identical to L.
+ * X: During normal operation, modification requires pool->lock and should
+ * be done only from local cpu. Either disabling preemption on local
+ * cpu or grabbing pool->lock is enough for read access. If
+ * POOL_DISASSOCIATED is set, it's identical to L.
*
* F: wq->flush_mutex protected.
*
* W: workqueue_lock protected.
*/
-struct global_cwq;
-struct worker_pool;
-
-/*
- * The poor guys doing the actual heavy lifting. All on-duty workers
- * are either serving the manager role, on idle list or on busy hash.
- */
-struct worker {
- /* on idle list while idle, on busy hash table while busy */
- union {
- struct list_head entry; /* L: while idle */
- struct hlist_node hentry; /* L: while busy */
- };
-
- struct work_struct *current_work; /* L: work being processed */
- struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
- struct list_head scheduled; /* L: scheduled works */
- struct task_struct *task; /* I: worker task */
- struct worker_pool *pool; /* I: the associated pool */
- /* 64 bytes boundary on 64bit, 32 on 32bit */
- unsigned long last_active; /* L: last active timestamp */
- unsigned int flags; /* X: flags */
- int id; /* I: worker id */
-
- /* for rebinding worker to CPU */
- struct work_struct rebind_work; /* L: for busy worker */
-};
+/* struct worker is defined in workqueue_internal.h */
struct worker_pool {
- struct global_cwq *gcwq; /* I: the owning gcwq */
+ spinlock_t lock; /* the pool lock */
+ unsigned int cpu; /* I: the associated cpu */
+ int id; /* I: pool ID */
unsigned int flags; /* X: flags */
struct list_head worklist; /* L: list of pending works */
@@ -165,34 +138,28 @@ struct worker_pool {
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
- struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
- struct ida worker_ida; /* L: for worker IDs */
-};
-
-/*
- * Global per-cpu workqueue. There's one and only one for each cpu
- * and all works are queued and processed here regardless of their
- * target workqueues.
- */
-struct global_cwq {
- spinlock_t lock; /* the gcwq lock */
- unsigned int cpu; /* I: the associated cpu */
- unsigned int flags; /* L: GCWQ_* flags */
-
- /* workers are chained either in busy_hash or pool idle_list */
- struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
+ /* workers are chained either in busy_hash or idle_list */
+ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
- struct worker_pool pools[NR_WORKER_POOLS];
- /* normal and highpri pools */
+ struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
+ struct ida worker_ida; /* L: for worker IDs */
+
+ /*
+ * The current concurrency level. As it's likely to be accessed
+ * from other CPUs during try_to_wake_up(), put it in a separate
+ * cacheline.
+ */
+ atomic_t nr_running ____cacheline_aligned_in_smp;
} ____cacheline_aligned_in_smp;
/*
- * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
- * work_struct->data are used for flags and thus cwqs need to be
- * aligned at two's power of the number of flag bits.
+ * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
+ * of work_struct->data are used for flags and the remaining high bits
+ * point to the pwq; thus, pwqs need to be aligned at two's power of the
+ * number of flag bits.
*/
-struct cpu_workqueue_struct {
+struct pool_workqueue {
struct worker_pool *pool; /* I: the associated pool */
struct workqueue_struct *wq; /* I: the owning workqueue */
int work_color; /* L: current color */
@@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;
struct workqueue_struct {
unsigned int flags; /* W: WQ_* flags */
union {
- struct cpu_workqueue_struct __percpu *pcpu;
- struct cpu_workqueue_struct *single;
+ struct pool_workqueue __percpu *pcpu;
+ struct pool_workqueue *single;
unsigned long v;
- } cpu_wq; /* I: cwq's */
+ } pool_wq; /* I: pwq's */
struct list_head list; /* W: list of all workqueues */
struct mutex flush_mutex; /* protects wq flushing */
int work_color; /* F: current work color */
int flush_color; /* F: current flush color */
- atomic_t nr_cwqs_to_flush; /* flush in progress */
+ atomic_t nr_pwqs_to_flush; /* flush in progress */
struct wq_flusher *first_flusher; /* F: first flusher */
struct list_head flusher_queue; /* F: flush waiters */
struct list_head flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@ struct workqueue_struct {
struct worker *rescuer; /* I: rescue worker */
int nr_drainers; /* W: drain in progress */
- int saved_max_active; /* W: saved cwq max_active */
+ int saved_max_active; /* W: saved pwq max_active */
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
@@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
-#define for_each_worker_pool(pool, gcwq) \
- for ((pool) = &(gcwq)->pools[0]; \
- (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
+#define for_each_std_worker_pool(pool, cpu) \
+ for ((pool) = &std_worker_pools(cpu)[0]; \
+ (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
-#define for_each_busy_worker(worker, i, pos, gcwq) \
- for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
- hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
+#define for_each_busy_worker(worker, i, pool) \
+ hash_for_each(pool->busy_hash, i, worker, hentry)
-static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
- unsigned int sw)
+static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
+ unsigned int sw)
{
if (cpu < nr_cpu_ids) {
if (sw & 1) {
@@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
if (sw & 2)
return WORK_CPU_UNBOUND;
}
- return WORK_CPU_NONE;
+ return WORK_CPU_END;
}
-static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
- struct workqueue_struct *wq)
+static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
+ struct workqueue_struct *wq)
{
- return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
+ return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
}
/*
* CPU iterators
*
- * An extra gcwq is defined for an invalid cpu number
+ * An extra cpu number is defined using an invalid cpu number
* (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
- * specific CPU. The following iterators are similar to
- * for_each_*_cpu() iterators but also considers the unbound gcwq.
+ * specific CPU. The following iterators are similar to for_each_*_cpu()
+ * iterators but also considers the unbound CPU.
*
- * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
- * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
- * for_each_cwq_cpu() : possible CPUs for bound workqueues,
+ * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
+ * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
+ * for_each_pwq_cpu() : possible CPUs for bound workqueues,
* WORK_CPU_UNBOUND for unbound workqueues
*/
-#define for_each_gcwq_cpu(cpu) \
- for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
+#define for_each_wq_cpu(cpu) \
+ for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
+ (cpu) < WORK_CPU_END; \
+ (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
-#define for_each_online_gcwq_cpu(cpu) \
- for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
+#define for_each_online_wq_cpu(cpu) \
+ for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
+ (cpu) < WORK_CPU_END; \
+ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
-#define for_each_cwq_cpu(cpu, wq) \
- for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
- (cpu) < WORK_CPU_NONE; \
- (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
+#define for_each_pwq_cpu(cpu, wq) \
+ for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
+ (cpu) < WORK_CPU_END; \
+ (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
#ifdef CONFIG_DEBUG_OBJECTS_WORK
@@ -459,57 +425,69 @@ static LIST_HEAD(workqueues);
static bool workqueue_freezing; /* W: have wqs started freezing? */
/*
- * The almighty global cpu workqueues. nr_running is the only field
- * which is expected to be used frequently by other cpus via
- * try_to_wake_up(). Put it in a separate cacheline.
+ * The CPU and unbound standard worker pools. The unbound ones have
+ * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
*/
-static DEFINE_PER_CPU(struct global_cwq, global_cwq);
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
+ cpu_std_worker_pools);
+static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
-/*
- * Global cpu workqueue and nr_running counter for unbound gcwq. The
- * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
- * workers have WORKER_UNBOUND set.
- */
-static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
- [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
-};
+/* idr of all pools */
+static DEFINE_MUTEX(worker_pool_idr_mutex);
+static DEFINE_IDR(worker_pool_idr);
static int worker_thread(void *__worker);
-static int worker_pool_pri(struct worker_pool *pool)
+static struct worker_pool *std_worker_pools(int cpu)
{
- return pool - pool->gcwq->pools;
+ if (cpu != WORK_CPU_UNBOUND)
+ return per_cpu(cpu_std_worker_pools, cpu);
+ else
+ return unbound_std_worker_pools;
}
-static struct global_cwq *get_gcwq(unsigned int cpu)
+static int std_worker_pool_pri(struct worker_pool *pool)
{
- if (cpu != WORK_CPU_UNBOUND)
- return &per_cpu(global_cwq, cpu);
- else
- return &unbound_global_cwq;
+ return pool - std_worker_pools(pool->cpu);
}
-static atomic_t *get_pool_nr_running(struct worker_pool *pool)
+/* allocate ID and assign it to @pool */
+static int worker_pool_assign_id(struct worker_pool *pool)
{
- int cpu = pool->gcwq->cpu;
- int idx = worker_pool_pri(pool);
+ int ret;
- if (cpu != WORK_CPU_UNBOUND)
- return &per_cpu(pool_nr_running, cpu)[idx];
- else
- return &unbound_pool_nr_running[idx];
+ mutex_lock(&worker_pool_idr_mutex);
+ idr_pre_get(&worker_pool_idr, GFP_KERNEL);
+ ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
+ mutex_unlock(&worker_pool_idr_mutex);
+
+ return ret;
}
-static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
- struct workqueue_struct *wq)
+/*
+ * Lookup worker_pool by id. The idr currently is built during boot and
+ * never modified. Don't worry about locking for now.
+ */
+static struct worker_pool *worker_pool_by_id(int pool_id)
+{
+ return idr_find(&worker_pool_idr, pool_id);
+}
+
+static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
+{
+ struct worker_pool *pools = std_worker_pools(cpu);
+
+ return &pools[highpri];
+}
+
+static struct pool_workqueue *get_pwq(unsigned int cpu,
+ struct workqueue_struct *wq)
{
if (!(wq->flags & WQ_UNBOUND)) {
if (likely(cpu < nr_cpu_ids))
- return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+ return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
} else if (likely(cpu == WORK_CPU_UNBOUND))
- return wq->cpu_wq.single;
+ return wq->pool_wq.single;
return NULL;
}
@@ -530,19 +508,19 @@ static int work_next_color(int color)
}
/*
- * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
- * contain the pointer to the queued cwq. Once execution starts, the flag
- * is cleared and the high bits contain OFFQ flags and CPU number.
+ * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
+ * contain the pointer to the queued pwq. Once execution starts, the flag
+ * is cleared and the high bits contain OFFQ flags and pool ID.
*
- * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
- * and clear_work_data() can be used to set the cwq, cpu or clear
+ * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
+ * and clear_work_data() can be used to set the pwq, pool or clear
* work->data. These functions should only be called while the work is
* owned - ie. while the PENDING bit is set.
*
- * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
- * a work. gcwq is available once the work has been queued anywhere after
- * initialization until it is sync canceled. cwq is available only while
- * the work item is queued.
+ * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
+ * corresponding to a work. Pool is available once the work has been
+ * queued anywhere after initialization until it is sync canceled. pwq is
+ * available only while the work item is queued.
*
* %WORK_OFFQ_CANCELING is used to mark a work item which is being
* canceled. While being canceled, a work item may have its PENDING set
@@ -556,16 +534,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,
atomic_long_set(&work->data, data | flags | work_static(work));
}
-static void set_work_cwq(struct work_struct *work,
- struct cpu_workqueue_struct *cwq,
+static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
unsigned long extra_flags)
{
- set_work_data(work, (unsigned long)cwq,
- WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
+ set_work_data(work, (unsigned long)pwq,
+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}
-static void set_work_cpu_and_clear_pending(struct work_struct *work,
- unsigned int cpu)
+static void set_work_pool_and_keep_pending(struct work_struct *work,
+ int pool_id)
+{
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
+ WORK_STRUCT_PENDING);
+}
+
+static void set_work_pool_and_clear_pending(struct work_struct *work,
+ int pool_id)
{
/*
* The following wmb is paired with the implied mb in
@@ -574,67 +558,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,
* owner.
*/
smp_wmb();
- set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
}
static void clear_work_data(struct work_struct *work)
{
- smp_wmb(); /* see set_work_cpu_and_clear_pending() */
- set_work_data(work, WORK_STRUCT_NO_CPU, 0);
+ smp_wmb(); /* see set_work_pool_and_clear_pending() */
+ set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
-static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
+static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- if (data & WORK_STRUCT_CWQ)
+ if (data & WORK_STRUCT_PWQ)
return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
else
return NULL;
}
-static struct global_cwq *get_work_gcwq(struct work_struct *work)
+/**
+ * get_work_pool - return the worker_pool a given work was associated with
+ * @work: the work item of interest
+ *
+ * Return the worker_pool @work was last associated with. %NULL if none.
+ */
+static struct worker_pool *get_work_pool(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- unsigned int cpu;
+ struct worker_pool *pool;
+ int pool_id;
- if (data & WORK_STRUCT_CWQ)
- return ((struct cpu_workqueue_struct *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
- cpu = data >> WORK_OFFQ_CPU_SHIFT;
- if (cpu == WORK_CPU_NONE)
+ pool_id = data >> WORK_OFFQ_POOL_SHIFT;
+ if (pool_id == WORK_OFFQ_POOL_NONE)
return NULL;
- BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
- return get_gcwq(cpu);
+ pool = worker_pool_by_id(pool_id);
+ WARN_ON_ONCE(!pool);
+ return pool;
+}
+
+/**
+ * get_work_pool_id - return the worker pool ID a given work is associated with
+ * @work: the work item of interest
+ *
+ * Return the worker_pool ID @work was last associated with.
+ * %WORK_OFFQ_POOL_NONE if none.
+ */
+static int get_work_pool_id(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+
+ return data >> WORK_OFFQ_POOL_SHIFT;
}
static void mark_work_canceling(struct work_struct *work)
{
- struct global_cwq *gcwq = get_work_gcwq(work);
- unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
+ unsigned long pool_id = get_work_pool_id(work);
- set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
- WORK_STRUCT_PENDING);
+ pool_id <<= WORK_OFFQ_POOL_SHIFT;
+ set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}
static bool work_is_canceling(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
+ return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}
/*
* Policy functions. These define the policies on how the global worker
* pools are managed. Unless noted otherwise, these functions assume that
- * they're being called with gcwq->lock held.
+ * they're being called with pool->lock held.
*/
static bool __need_more_worker(struct worker_pool *pool)
{
- return !atomic_read(get_pool_nr_running(pool));
+ return !atomic_read(&pool->nr_running);
}
/*
@@ -642,7 +651,7 @@ static bool __need_more_worker(struct worker_pool *pool)
* running workers.
*
* Note that, because unbound workers never contribute to nr_running, this
- * function will always return %true for unbound gcwq as long as the
+ * function will always return %true for unbound pools as long as the
* worklist isn't empty.
*/
static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +668,8 @@ static bool may_start_working(struct worker_pool *pool)
/* Do I need to keep working? Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
- atomic_t *nr_running = get_pool_nr_running(pool);
-
- return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
+ return !list_empty(&pool->worklist) &&
+ atomic_read(&pool->nr_running) <= 1;
}
/* Do we need a new worker? Called from manager. */
@@ -714,7 +722,7 @@ static struct worker *first_worker(struct worker_pool *pool)
* Wake up the first idle worker of @pool.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void wake_up_worker(struct worker_pool *pool)
{
@@ -740,8 +748,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
struct worker *worker = kthread_data(task);
if (!(worker->flags & WORKER_NOT_RUNNING)) {
- WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu);
- atomic_inc(get_pool_nr_running(worker->pool));
+ WARN_ON_ONCE(worker->pool->cpu != cpu);
+ atomic_inc(&worker->pool->nr_running);
}
}
@@ -764,12 +772,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
unsigned int cpu)
{
struct worker *worker = kthread_data(task), *to_wakeup = NULL;
- struct worker_pool *pool = worker->pool;
- atomic_t *nr_running = get_pool_nr_running(pool);
+ struct worker_pool *pool;
+ /*
+ * Rescuers, which may not have all the fields set up like normal
+ * workers, also reach here, let's not access anything before
+ * checking NOT_RUNNING.
+ */
if (worker->flags & WORKER_NOT_RUNNING)
return NULL;
+ pool = worker->pool;
+
/* this can only happen on the local cpu */
BUG_ON(cpu != raw_smp_processor_id());
@@ -781,10 +795,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
* NOT_RUNNING is clear. This means that we're bound to and
* running on the local cpu w/ rq lock held and preemption
* disabled, which in turn means that none else could be
- * manipulating idle_list, so dereferencing idle_list without gcwq
+ * manipulating idle_list, so dereferencing idle_list without pool
* lock is safe.
*/
- if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
+ if (atomic_dec_and_test(&pool->nr_running) &&
+ !list_empty(&pool->worklist))
to_wakeup = first_worker(pool);
return to_wakeup ? to_wakeup->task : NULL;
}
@@ -800,7 +815,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
* woken up.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags,
bool wakeup)
@@ -816,14 +831,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
*/
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_t *nr_running = get_pool_nr_running(pool);
-
if (wakeup) {
- if (atomic_dec_and_test(nr_running) &&
+ if (atomic_dec_and_test(&pool->nr_running) &&
!list_empty(&pool->worklist))
wake_up_worker(pool);
} else
- atomic_dec(nr_running);
+ atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
@@ -837,7 +850,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
* Clear @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
@@ -855,87 +868,55 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(get_pool_nr_running(pool));
+ atomic_inc(&pool->nr_running);
}
/**
- * busy_worker_head - return the busy hash head for a work
- * @gcwq: gcwq of interest
- * @work: work to be hashed
- *
- * Return hash head of @gcwq for @work.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to the hash head.
- */
-static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
- struct work_struct *work)
-{
- const int base_shift = ilog2(sizeof(struct work_struct));
- unsigned long v = (unsigned long)work;
-
- /* simple shift and fold hash, do we need something better? */
- v >>= base_shift;
- v += v >> BUSY_WORKER_HASH_ORDER;
- v &= BUSY_WORKER_HASH_MASK;
-
- return &gcwq->busy_hash[v];
-}
-
-/**
- * __find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
- * @bwh: hash head as returned by busy_worker_head()
+ * find_worker_executing_work - find worker which is executing a work
+ * @pool: pool of interest
* @work: work to find worker for
*
- * Find a worker which is executing @work on @gcwq. @bwh should be
- * the hash head obtained by calling busy_worker_head() with the same
- * work.
+ * Find a worker which is executing @work on @pool by searching
+ * @pool->busy_hash which is keyed by the address of @work. For a worker
+ * to match, its current execution should match the address of @work and
+ * its work function. This is to avoid unwanted dependency between
+ * unrelated work executions through a work item being recycled while still
+ * being executed.
+ *
+ * This is a bit tricky. A work item may be freed once its execution
+ * starts and nothing prevents the freed area from being recycled for
+ * another work item. If the same work item address ends up being reused
+ * before the original execution finishes, workqueue will identify the
+ * recycled work item as currently executing and make it wait until the
+ * current execution finishes, introducing an unwanted dependency.
+ *
+ * This function checks the work item address, work function and workqueue
+ * to avoid false positives. Note that this isn't complete as one may
+ * construct a work function which can introduce dependency onto itself
+ * through a recycled work item. Well, if somebody wants to shoot oneself
+ * in the foot that badly, there's only so much we can do, and if such
+ * deadlock actually occurs, it should be easy to locate the culprit work
+ * function.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*
* RETURNS:
* Pointer to worker which is executing @work if found, NULL
* otherwise.
*/
-static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
- struct hlist_head *bwh,
- struct work_struct *work)
+static struct worker *find_worker_executing_work(struct worker_pool *pool,
+ struct work_struct *work)
{
struct worker *worker;
- struct hlist_node *tmp;
- hlist_for_each_entry(worker, tmp, bwh, hentry)
- if (worker->current_work == work)
+ hash_for_each_possible(pool->busy_hash, worker, hentry,
+ (unsigned long)work)
+ if (worker->current_work == work &&
+ worker->current_func == work->func)
return worker;
- return NULL;
-}
-/**
- * find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
- * @work: work to find worker for
- *
- * Find a worker which is executing @work on @gcwq. This function is
- * identical to __find_worker_executing_work() except that this
- * function calculates @bwh itself.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to worker which is executing @work if found, NULL
- * otherwise.
- */
-static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
- struct work_struct *work)
-{
- return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
- work);
+ return NULL;
}
/**
@@ -953,7 +934,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
* nested inside outer list_for_each_entry_safe().
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void move_linked_works(struct work_struct *work, struct list_head *head,
struct work_struct **nextp)
@@ -979,67 +960,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
*nextp = n;
}
-static void cwq_activate_delayed_work(struct work_struct *work)
+static void pwq_activate_delayed_work(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
- move_linked_works(work, &cwq->pool->worklist, NULL);
+ move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
- cwq->nr_active++;
+ pwq->nr_active++;
}
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
{
- struct work_struct *work = list_first_entry(&cwq->delayed_works,
+ struct work_struct *work = list_first_entry(&pwq->delayed_works,
struct work_struct, entry);
- cwq_activate_delayed_work(work);
+ pwq_activate_delayed_work(work);
}
/**
- * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
- * @cwq: cwq of interest
+ * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
+ * @pwq: pwq of interest
* @color: color of work which left the queue
*
* A work either has completed or is removed from pending queue,
- * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ * decrement nr_in_flight of its pwq and handle workqueue flushing.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
/* ignore uncolored works */
if (color == WORK_NO_COLOR)
return;
- cwq->nr_in_flight[color]--;
+ pwq->nr_in_flight[color]--;
- cwq->nr_active--;
- if (!list_empty(&cwq->delayed_works)) {
+ pwq->nr_active--;
+ if (!list_empty(&pwq->delayed_works)) {
/* one down, submit a delayed one */
- if (cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
+ if (pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
}
/* is flush in progress and are we at the flushing tip? */
- if (likely(cwq->flush_color != color))
+ if (likely(pwq->flush_color != color))
return;
/* are there still in-flight works? */
- if (cwq->nr_in_flight[color])
+ if (pwq->nr_in_flight[color])
return;
- /* this cwq is done, clear flush_color */
- cwq->flush_color = -1;
+ /* this pwq is done, clear flush_color */
+ pwq->flush_color = -1;
/*
- * If this was the last cwq, wake up the first flusher. It
+ * If this was the last pwq, wake up the first flusher. It
* will handle the rest.
*/
- if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
- complete(&cwq->wq->first_flusher->done);
+ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
+ complete(&pwq->wq->first_flusher->done);
}
/**
@@ -1070,7 +1051,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
unsigned long *flags)
{
- struct global_cwq *gcwq;
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
local_irq_save(*flags);
@@ -1095,41 +1077,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
* The queueing is in progress, or it is already queued. Try to
* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
*/
- gcwq = get_work_gcwq(work);
- if (!gcwq)
+ pool = get_work_pool(work);
+ if (!pool)
goto fail;
- spin_lock(&gcwq->lock);
- if (!list_empty(&work->entry)) {
+ spin_lock(&pool->lock);
+ /*
+ * work->data is guaranteed to point to pwq only while the work
+ * item is queued on pwq->wq, and both updating work->data to point
+ * to pwq on queueing and to pool on dequeueing are done under
+ * pwq->pool->lock. This in turn guarantees that, if work->data
+ * points to pwq which is associated with a locked pool, the work
+ * item is currently queued on that pool.
+ */
+ pwq = get_work_pwq(work);
+ if (pwq && pwq->pool == pool) {
+ debug_work_deactivate(work);
+
/*
- * This work is queued, but perhaps we locked the wrong gcwq.
- * In that case we must see the new value after rmb(), see
- * insert_work()->wmb().
+ * A delayed work item cannot be grabbed directly because
+ * it might have linked NO_COLOR work items which, if left
+ * on the delayed_list, will confuse pwq->nr_active
+ * management later on and cause stall. Make sure the work
+ * item is activated before grabbing.
*/
- smp_rmb();
- if (gcwq == get_work_gcwq(work)) {
- debug_work_deactivate(work);
+ if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
+ pwq_activate_delayed_work(work);
- /*
- * A delayed work item cannot be grabbed directly
- * because it might have linked NO_COLOR work items
- * which, if left on the delayed_list, will confuse
- * cwq->nr_active management later on and cause
- * stall. Make sure the work item is activated
- * before grabbing.
- */
- if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
- cwq_activate_delayed_work(work);
+ list_del_init(&work->entry);
+ pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
- list_del_init(&work->entry);
- cwq_dec_nr_in_flight(get_work_cwq(work),
- get_work_color(work));
+ /* work->data points to pwq iff queued, point to pool */
+ set_work_pool_and_keep_pending(work, pool->id);
- spin_unlock(&gcwq->lock);
- return 1;
- }
+ spin_unlock(&pool->lock);
+ return 1;
}
- spin_unlock(&gcwq->lock);
+ spin_unlock(&pool->lock);
fail:
local_irq_restore(*flags);
if (work_is_canceling(work))
@@ -1139,33 +1123,25 @@ fail:
}
/**
- * insert_work - insert a work into gcwq
- * @cwq: cwq @work belongs to
+ * insert_work - insert a work into a pool
+ * @pwq: pwq @work belongs to
* @work: work to insert
* @head: insertion point
* @extra_flags: extra WORK_STRUCT_* flags to set
*
- * Insert @work which belongs to @cwq into @gcwq after @head.
- * @extra_flags is or'd to work_struct flags.
+ * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
+ * work_struct flags.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void insert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work, struct list_head *head,
- unsigned int extra_flags)
+static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
+ struct list_head *head, unsigned int extra_flags)
{
- struct worker_pool *pool = cwq->pool;
+ struct worker_pool *pool = pwq->pool;
/* we own @work, set data and link */
- set_work_cwq(work, cwq, extra_flags);
-
- /*
- * Ensure that we get the right work->data if we see the
- * result of list_add() below, see try_to_grab_pending().
- */
- smp_wmb();
-
+ set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
/*
@@ -1181,41 +1157,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
/*
* Test whether @work is being queued from another work executing on the
- * same workqueue. This is rather expensive and should only be used from
- * cold paths.
+ * same workqueue.
*/
static bool is_chained_work(struct workqueue_struct *wq)
{
- unsigned long flags;
- unsigned int cpu;
-
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
- struct worker *worker;
- struct hlist_node *pos;
- int i;
+ struct worker *worker;
- spin_lock_irqsave(&gcwq->lock, flags);
- for_each_busy_worker(worker, i, pos, gcwq) {
- if (worker->task != current)
- continue;
- spin_unlock_irqrestore(&gcwq->lock, flags);
- /*
- * I'm @worker, no locking necessary. See if @work
- * is headed to the same workqueue.
- */
- return worker->current_cwq->wq == wq;
- }
- spin_unlock_irqrestore(&gcwq->lock, flags);
- }
- return false;
+ worker = current_wq_worker();
+ /*
+ * Return %true iff I'm a worker execuing a work item on @wq. If
+ * I'm @worker, it's safe to dereference it without locking.
+ */
+ return worker && worker->current_pwq->wq == wq;
}
static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
- struct global_cwq *gcwq;
- struct cpu_workqueue_struct *cwq;
+ struct pool_workqueue *pwq;
struct list_head *worklist;
unsigned int work_flags;
unsigned int req_cpu = cpu;
@@ -1235,9 +1194,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
WARN_ON_ONCE(!is_chained_work(wq)))
return;
- /* determine gcwq to use */
+ /* determine the pwq to use */
if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *last_gcwq;
+ struct worker_pool *last_pool;
if (cpu == WORK_CPU_UNBOUND)
cpu = raw_smp_processor_id();
@@ -1248,55 +1207,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
* work needs to be queued on that cpu to guarantee
* non-reentrancy.
*/
- gcwq = get_gcwq(cpu);
- last_gcwq = get_work_gcwq(work);
+ pwq = get_pwq(cpu, wq);
+ last_pool = get_work_pool(work);
- if (last_gcwq && last_gcwq != gcwq) {
+ if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
- spin_lock(&last_gcwq->lock);
+ spin_lock(&last_pool->lock);
- worker = find_worker_executing_work(last_gcwq, work);
+ worker = find_worker_executing_work(last_pool, work);
- if (worker && worker->current_cwq->wq == wq)
- gcwq = last_gcwq;
- else {
+ if (worker && worker->current_pwq->wq == wq) {
+ pwq = get_pwq(last_pool->cpu, wq);
+ } else {
/* meh... not running there, queue here */
- spin_unlock(&last_gcwq->lock);
- spin_lock(&gcwq->lock);
+ spin_unlock(&last_pool->lock);
+ spin_lock(&pwq->pool->lock);
}
} else {
- spin_lock(&gcwq->lock);
+ spin_lock(&pwq->pool->lock);
}
} else {
- gcwq = get_gcwq(WORK_CPU_UNBOUND);
- spin_lock(&gcwq->lock);
+ pwq = get_pwq(WORK_CPU_UNBOUND, wq);
+ spin_lock(&pwq->pool->lock);
}
- /* gcwq determined, get cwq and queue */
- cwq = get_cwq(gcwq->cpu, wq);
- trace_workqueue_queue_work(req_cpu, cwq, work);
+ /* pwq determined, queue */
+ trace_workqueue_queue_work(req_cpu, pwq, work);
if (WARN_ON(!list_empty(&work->entry))) {
- spin_unlock(&gcwq->lock);
+ spin_unlock(&pwq->pool->lock);
return;
}
- cwq->nr_in_flight[cwq->work_color]++;
- work_flags = work_color_to_flags(cwq->work_color);
+ pwq->nr_in_flight[pwq->work_color]++;
+ work_flags = work_color_to_flags(pwq->work_color);
- if (likely(cwq->nr_active < cwq->max_active)) {
+ if (likely(pwq->nr_active < pwq->max_active)) {
trace_workqueue_activate_work(work);
- cwq->nr_active++;
- worklist = &cwq->pool->worklist;
+ pwq->nr_active++;
+ worklist = &pwq->pool->worklist;
} else {
work_flags |= WORK_STRUCT_DELAYED;
- worklist = &cwq->delayed_works;
+ worklist = &pwq->delayed_works;
}
- insert_work(cwq, work, worklist, work_flags);
+ insert_work(pwq, work, worklist, work_flags);
- spin_unlock(&gcwq->lock);
+ spin_unlock(&pwq->pool->lock);
}
/**
@@ -1347,19 +1305,17 @@ EXPORT_SYMBOL_GPL(queue_work);
void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
/* should have been called from irqsafe timer with irq already off */
- __queue_work(dwork->cpu, cwq->wq, &dwork->work);
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
}
-EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
+EXPORT_SYMBOL(delayed_work_timer_fn);
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
- unsigned int lcpu;
WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
@@ -1379,30 +1335,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
timer_stats_timer_set_start_info(&dwork->timer);
- /*
- * This stores cwq for the moment, for the timer_fn. Note that the
- * work's gcwq is preserved to allow reentrance detection for
- * delayed works.
- */
- if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *gcwq = get_work_gcwq(work);
-
- /*
- * If we cannot get the last gcwq from @work directly,
- * select the last CPU such that it avoids unnecessarily
- * triggering non-reentrancy check in __queue_work().
- */
- lcpu = cpu;
- if (gcwq)
- lcpu = gcwq->cpu;
- if (lcpu == WORK_CPU_UNBOUND)
- lcpu = raw_smp_processor_id();
- } else {
- lcpu = WORK_CPU_UNBOUND;
- }
-
- set_work_cwq(work, get_cwq(lcpu, wq), 0);
-
+ dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -1519,12 +1452,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
* necessary.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void worker_enter_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
BUG_ON(worker->flags & WORKER_IDLE);
BUG_ON(!list_empty(&worker->entry) &&
@@ -1542,14 +1474,14 @@ static void worker_enter_idle(struct worker *worker)
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
- * Sanity check nr_running. Because gcwq_unbind_fn() releases
- * gcwq->lock between setting %WORKER_UNBOUND and zapping
+ * Sanity check nr_running. Because wq_unbind_fn() releases
+ * pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
*/
- WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
pool->nr_workers == pool->nr_idle &&
- atomic_read(get_pool_nr_running(pool)));
+ atomic_read(&pool->nr_running));
}
/**
@@ -1559,7 +1491,7 @@ static void worker_enter_idle(struct worker *worker)
* @worker is leaving idle state. Update stats.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void worker_leave_idle(struct worker *worker)
{
@@ -1572,7 +1504,7 @@ static void worker_leave_idle(struct worker *worker)
}
/**
- * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
+ * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
* @worker: self
*
* Works which are scheduled while the cpu is online must at least be
@@ -1584,27 +1516,27 @@ static void worker_leave_idle(struct worker *worker)
* themselves to the target cpu and may race with cpu going down or
* coming online. kthread_bind() can't be used because it may put the
* worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and gcwq may be
+ * verbatim as it's best effort and blocking and pool may be
* [dis]associated in the meantime.
*
- * This function tries set_cpus_allowed() and locks gcwq and verifies the
- * binding against %GCWQ_DISASSOCIATED which is set during
+ * This function tries set_cpus_allowed() and locks pool and verifies the
+ * binding against %POOL_DISASSOCIATED which is set during
* %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
* enters idle state or fetches works without dropping lock, it can
* guarantee the scheduling requirement described in the first paragraph.
*
* CONTEXT:
- * Might sleep. Called without any lock but returns with gcwq->lock
+ * Might sleep. Called without any lock but returns with pool->lock
* held.
*
* RETURNS:
- * %true if the associated gcwq is online (@worker is successfully
+ * %true if the associated pool is online (@worker is successfully
* bound), %false if offline.
*/
static bool worker_maybe_bind_and_lock(struct worker *worker)
-__acquires(&gcwq->lock)
+__acquires(&pool->lock)
{
- struct global_cwq *gcwq = worker->pool->gcwq;
+ struct worker_pool *pool = worker->pool;
struct task_struct *task = worker->task;
while (true) {
@@ -1612,19 +1544,19 @@ __acquires(&gcwq->lock)
* The following call may fail, succeed or succeed
* without actually migrating the task to the cpu if
* it races with cpu hotunplug operation. Verify
- * against GCWQ_DISASSOCIATED.
+ * against POOL_DISASSOCIATED.
*/
- if (!(gcwq->flags & GCWQ_DISASSOCIATED))
- set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
+ if (!(pool->flags & POOL_DISASSOCIATED))
+ set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
- spin_lock_irq(&gcwq->lock);
- if (gcwq->flags & GCWQ_DISASSOCIATED)
+ spin_lock_irq(&pool->lock);
+ if (pool->flags & POOL_DISASSOCIATED)
return false;
- if (task_cpu(task) == gcwq->cpu &&
+ if (task_cpu(task) == pool->cpu &&
cpumask_equal(&current->cpus_allowed,
- get_cpu_mask(gcwq->cpu)))
+ get_cpu_mask(pool->cpu)))
return true;
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
/*
* We've raced with CPU hot[un]plug. Give it a breather
@@ -1643,15 +1575,13 @@ __acquires(&gcwq->lock)
*/
static void idle_worker_rebind(struct worker *worker)
{
- struct global_cwq *gcwq = worker->pool->gcwq;
-
/* CPU may go down again inbetween, clear UNBOUND only on success */
if (worker_maybe_bind_and_lock(worker))
worker_clr_flags(worker, WORKER_UNBOUND);
/* rebind complete, become available again */
list_add(&worker->entry, &worker->pool->idle_list);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&worker->pool->lock);
}
/*
@@ -1663,19 +1593,18 @@ static void idle_worker_rebind(struct worker *worker)
static void busy_worker_rebind_fn(struct work_struct *work)
{
struct worker *worker = container_of(work, struct worker, rebind_work);
- struct global_cwq *gcwq = worker->pool->gcwq;
if (worker_maybe_bind_and_lock(worker))
worker_clr_flags(worker, WORKER_UNBOUND);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&worker->pool->lock);
}
/**
- * rebind_workers - rebind all workers of a gcwq to the associated CPU
- * @gcwq: gcwq of interest
+ * rebind_workers - rebind all workers of a pool to the associated CPU
+ * @pool: pool of interest
*
- * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
+ * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
* is different for idle and busy ones.
*
* Idle ones will be removed from the idle_list and woken up. They will
@@ -1693,38 +1622,31 @@ static void busy_worker_rebind_fn(struct work_struct *work)
* including the manager will not appear on @idle_list until rebind is
* complete, making local wake-ups safe.
*/
-static void rebind_workers(struct global_cwq *gcwq)
+static void rebind_workers(struct worker_pool *pool)
{
- struct worker_pool *pool;
struct worker *worker, *n;
- struct hlist_node *pos;
int i;
- lockdep_assert_held(&gcwq->lock);
-
- for_each_worker_pool(pool, gcwq)
- lockdep_assert_held(&pool->assoc_mutex);
+ lockdep_assert_held(&pool->assoc_mutex);
+ lockdep_assert_held(&pool->lock);
/* dequeue and kick idle ones */
- for_each_worker_pool(pool, gcwq) {
- list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
- /*
- * idle workers should be off @pool->idle_list
- * until rebind is complete to avoid receiving
- * premature local wake-ups.
- */
- list_del_init(&worker->entry);
+ list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
+ /*
+ * idle workers should be off @pool->idle_list until rebind
+ * is complete to avoid receiving premature local wake-ups.
+ */
+ list_del_init(&worker->entry);
- /*
- * worker_thread() will see the above dequeuing
- * and call idle_worker_rebind().
- */
- wake_up_process(worker->task);
- }
+ /*
+ * worker_thread() will see the above dequeuing and call
+ * idle_worker_rebind().
+ */
+ wake_up_process(worker->task);
}
/* rebind busy workers */
- for_each_busy_worker(worker, i, pos, gcwq) {
+ for_each_busy_worker(worker, i, pool) {
struct work_struct *rebind_work = &worker->rebind_work;
struct workqueue_struct *wq;
@@ -1736,16 +1658,16 @@ static void rebind_workers(struct global_cwq *gcwq)
/*
* wq doesn't really matter but let's keep @worker->pool
- * and @cwq->pool consistent for sanity.
+ * and @pwq->pool consistent for sanity.
*/
- if (worker_pool_pri(worker->pool))
+ if (std_worker_pool_pri(worker->pool))
wq = system_highpri_wq;
else
wq = system_wq;
- insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
- worker->scheduled.next,
- work_color_to_flags(WORK_NO_COLOR));
+ insert_work(get_pwq(pool->cpu, wq), rebind_work,
+ worker->scheduled.next,
+ work_color_to_flags(WORK_NO_COLOR));
}
}
@@ -1780,19 +1702,18 @@ static struct worker *alloc_worker(void)
*/
static struct worker *create_worker(struct worker_pool *pool)
{
- struct global_cwq *gcwq = pool->gcwq;
- const char *pri = worker_pool_pri(pool) ? "H" : "";
+ const char *pri = std_worker_pool_pri(pool) ? "H" : "";
struct worker *worker = NULL;
int id = -1;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
while (ida_get_new(&pool->worker_ida, &id)) {
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
goto fail;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
worker = alloc_worker();
if (!worker)
@@ -1801,30 +1722,30 @@ static struct worker *create_worker(struct worker_pool *pool)
worker->pool = pool;
worker->id = id;
- if (gcwq->cpu != WORK_CPU_UNBOUND)
+ if (pool->cpu != WORK_CPU_UNBOUND)
worker->task = kthread_create_on_node(worker_thread,
- worker, cpu_to_node(gcwq->cpu),
- "kworker/%u:%d%s", gcwq->cpu, id, pri);
+ worker, cpu_to_node(pool->cpu),
+ "kworker/%u:%d%s", pool->cpu, id, pri);
else
worker->task = kthread_create(worker_thread, worker,
"kworker/u:%d%s", id, pri);
if (IS_ERR(worker->task))
goto fail;
- if (worker_pool_pri(pool))
+ if (std_worker_pool_pri(pool))
set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
/*
* Determine CPU binding of the new worker depending on
- * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
+ * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
* flag remains stable across this function. See the comments
* above the flag definition for details.
*
* As an unbound worker may later become a regular one if CPU comes
* online, make sure every worker has %PF_THREAD_BOUND set.
*/
- if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
- kthread_bind(worker->task, gcwq->cpu);
+ if (!(pool->flags & POOL_DISASSOCIATED)) {
+ kthread_bind(worker->task, pool->cpu);
} else {
worker->task->flags |= PF_THREAD_BOUND;
worker->flags |= WORKER_UNBOUND;
@@ -1833,9 +1754,9 @@ static struct worker *create_worker(struct worker_pool *pool)
return worker;
fail:
if (id >= 0) {
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
ida_remove(&pool->worker_ida, id);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
kfree(worker);
return NULL;
@@ -1845,10 +1766,10 @@ fail:
* start_worker - start a newly created worker
* @worker: worker to start
*
- * Make the gcwq aware of @worker and start it.
+ * Make the pool aware of @worker and start it.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
static void start_worker(struct worker *worker)
{
@@ -1862,15 +1783,14 @@ static void start_worker(struct worker *worker)
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
- * Destroy @worker and adjust @gcwq stats accordingly.
+ * Destroy @worker and adjust @pool stats accordingly.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void destroy_worker(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
int id = worker->id;
/* sanity check frenzy */
@@ -1885,21 +1805,20 @@ static void destroy_worker(struct worker *worker)
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
kthread_stop(worker->task);
kfree(worker);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
ida_remove(&pool->worker_ida, id);
}
static void idle_worker_timeout(unsigned long __pool)
{
struct worker_pool *pool = (void *)__pool;
- struct global_cwq *gcwq = pool->gcwq;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (too_many_workers(pool)) {
struct worker *worker;
@@ -1918,20 +1837,20 @@ static void idle_worker_timeout(unsigned long __pool)
}
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
static bool send_mayday(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
- struct workqueue_struct *wq = cwq->wq;
+ struct pool_workqueue *pwq = get_work_pwq(work);
+ struct workqueue_struct *wq = pwq->wq;
unsigned int cpu;
if (!(wq->flags & WQ_RESCUER))
return false;
/* mayday mayday mayday */
- cpu = cwq->pool->gcwq->cpu;
+ cpu = pwq->pool->cpu;
/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
if (cpu == WORK_CPU_UNBOUND)
cpu = 0;
@@ -1940,13 +1859,12 @@ static bool send_mayday(struct work_struct *work)
return true;
}
-static void gcwq_mayday_timeout(unsigned long __pool)
+static void pool_mayday_timeout(unsigned long __pool)
{
struct worker_pool *pool = (void *)__pool;
- struct global_cwq *gcwq = pool->gcwq;
struct work_struct *work;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool)) {
/*
@@ -1959,7 +1877,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -1978,24 +1896,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)
* may_start_working() true.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*
* RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
+ * false if no action was taken and pool->lock stayed locked, true
* otherwise.
*/
static bool maybe_create_worker(struct worker_pool *pool)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
+__releases(&pool->lock)
+__acquires(&pool->lock)
{
- struct global_cwq *gcwq = pool->gcwq;
-
if (!need_to_create_worker(pool))
return false;
restart:
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2006,7 +1922,7 @@ restart:
worker = create_worker(pool);
if (worker) {
del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
start_worker(worker);
BUG_ON(need_to_create_worker(pool));
return true;
@@ -2023,7 +1939,7 @@ restart:
}
del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool))
goto restart;
return true;
@@ -2037,11 +1953,11 @@ restart:
* IDLE_WORKER_TIMEOUT.
*
* LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Called only from manager.
*
* RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
+ * false if no action was taken and pool->lock stayed locked, true
* otherwise.
*/
static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2071,21 +1987,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
* manage_workers - manage worker pool
* @worker: self
*
- * Assume the manager role and manage gcwq worker pool @worker belongs
+ * Assume the manager role and manage the worker pool @worker belongs
* to. At any given time, there can be only zero or one manager per
- * gcwq. The exclusion is handled automatically by this function.
+ * pool. The exclusion is handled automatically by this function.
*
* The caller can safely start processing works on false return. On
* true return, it's guaranteed that need_to_create_worker() is false
* and may_start_working() is true.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
* RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true if
- * some action was taken.
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * multiple times. Does GFP_KERNEL allocations.
*/
static bool manage_workers(struct worker *worker)
{
@@ -2107,20 +2023,20 @@ static bool manage_workers(struct worker *worker)
* manager against CPU hotplug.
*
* assoc_mutex would always be free unless CPU hotplug is in
- * progress. trylock first without dropping @gcwq->lock.
+ * progress. trylock first without dropping @pool->lock.
*/
if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
- spin_unlock_irq(&pool->gcwq->lock);
+ spin_unlock_irq(&pool->lock);
mutex_lock(&pool->assoc_mutex);
/*
* CPU hotplug could have happened while we were waiting
* for assoc_mutex. Hotplug itself can't handle us
* because manager isn't either on idle or busy list, and
- * @gcwq's state and ours could have deviated.
+ * @pool's state and ours could have deviated.
*
* As hotplug is now excluded via assoc_mutex, we can
* simply try to bind. It will succeed or fail depending
- * on @gcwq's current state. Try it and adjust
+ * on @pool's current state. Try it and adjust
* %WORKER_UNBOUND accordingly.
*/
if (worker_maybe_bind_and_lock(worker))
@@ -2157,18 +2073,15 @@ static bool manage_workers(struct worker *worker)
* call this function to process a work.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
+__releases(&pool->lock)
+__acquires(&pool->lock)
{
- struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
- struct hlist_head *bwh = busy_worker_head(gcwq, work);
- bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
- work_func_t f = work->func;
+ bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
int work_color;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
@@ -2186,11 +2099,11 @@ __acquires(&gcwq->lock)
/*
* Ensure we're on the correct CPU. DISASSOCIATED test is
* necessary to avoid spurious warnings from rescuers servicing the
- * unbound or a disassociated gcwq.
+ * unbound or a disassociated pool.
*/
WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
- !(gcwq->flags & GCWQ_DISASSOCIATED) &&
- raw_smp_processor_id() != gcwq->cpu);
+ !(pool->flags & POOL_DISASSOCIATED) &&
+ raw_smp_processor_id() != pool->cpu);
/*
* A single work shouldn't be executed concurrently by
@@ -2198,7 +2111,7 @@ __acquires(&gcwq->lock)
* already processing the work. If so, defer the work to the
* currently executing one.
*/
- collision = __find_worker_executing_work(gcwq, bwh, work);
+ collision = find_worker_executing_work(pool, work);
if (unlikely(collision)) {
move_linked_works(work, &collision->scheduled, NULL);
return;
@@ -2206,9 +2119,10 @@ __acquires(&gcwq->lock)
/* claim and dequeue */
debug_work_deactivate(work);
- hlist_add_head(&worker->hentry, bwh);
+ hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
worker->current_work = work;
- worker->current_cwq = cwq;
+ worker->current_func = work->func;
+ worker->current_pwq = pwq;
work_color = get_work_color(work);
list_del_init(&work->entry);
@@ -2221,53 +2135,55 @@ __acquires(&gcwq->lock)
worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
/*
- * Unbound gcwq isn't concurrency managed and work items should be
+ * Unbound pool isn't concurrency managed and work items should be
* executed ASAP. Wake up another worker if necessary.
*/
if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
wake_up_worker(pool);
/*
- * Record the last CPU and clear PENDING which should be the last
- * update to @work. Also, do this inside @gcwq->lock so that
+ * Record the last pool and clear PENDING which should be the last
+ * update to @work. Also, do this inside @pool->lock so that
* PENDING and queued state changes happen together while IRQ is
* disabled.
*/
- set_work_cpu_and_clear_pending(work, gcwq->cpu);
+ set_work_pool_and_clear_pending(work, pool->id);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
- lock_map_acquire_read(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
- f(work);
+ worker->current_func(work);
/*
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
trace_workqueue_execute_end(work);
lock_map_release(&lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
" last function: %pf\n",
- current->comm, preempt_count(), task_pid_nr(current), f);
+ current->comm, preempt_count(), task_pid_nr(current),
+ worker->current_func);
debug_show_held_locks(current);
dump_stack();
}
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* we're done with it, release */
- hlist_del_init(&worker->hentry);
+ hash_del(&worker->hentry);
worker->current_work = NULL;
- worker->current_cwq = NULL;
- cwq_dec_nr_in_flight(cwq, work_color);
+ worker->current_func = NULL;
+ worker->current_pwq = NULL;
+ pwq_dec_nr_in_flight(pwq, work_color);
}
/**
@@ -2279,7 +2195,7 @@ __acquires(&gcwq->lock)
* fetches a work from the top and executes it.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
@@ -2295,8 +2211,8 @@ static void process_scheduled_works(struct worker *worker)
* worker_thread - the worker thread function
* @__worker: self
*
- * The gcwq worker thread function. There's a single dynamic pool of
- * these per each cpu. These workers process all works regardless of
+ * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
+ * of these per each cpu. These workers process all works regardless of
* their specific target workqueue. The only exception is works which
* belong to workqueues with a rescuer which will be explained in
* rescuer_thread().
@@ -2305,16 +2221,15 @@ static int worker_thread(void *__worker)
{
struct worker *worker = __worker;
struct worker_pool *pool = worker->pool;
- struct global_cwq *gcwq = pool->gcwq;
/* tell the scheduler that this is a workqueue worker */
worker->task->flags |= PF_WQ_WORKER;
woke_up:
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
/* we are off idle list if destruction or rebind is requested */
if (unlikely(list_empty(&worker->entry))) {
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
/* if DIE is set, destruction is requested */
if (worker->flags & WORKER_DIE) {
@@ -2373,52 +2288,59 @@ sleep:
goto recheck;
/*
- * gcwq->lock is held and there's no work to process and no
- * need to manage, sleep. Workers are woken up only while
- * holding gcwq->lock or from local cpu, so setting the
- * current state before releasing gcwq->lock is enough to
- * prevent losing any event.
+ * pool->lock is held and there's no work to process and no need to
+ * manage, sleep. Workers are woken up only while holding
+ * pool->lock or from local cpu, so setting the current state
+ * before releasing pool->lock is enough to prevent losing any
+ * event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
/**
* rescuer_thread - the rescuer thread function
- * @__wq: the associated workqueue
+ * @__rescuer: self
*
* Workqueue rescuer thread function. There's one rescuer for each
* workqueue which has WQ_RESCUER set.
*
- * Regular work processing on a gcwq may block trying to create a new
+ * Regular work processing on a pool may block trying to create a new
* worker which uses GFP_KERNEL allocation which has slight chance of
* developing into deadlock if some works currently on the same queue
* need to be processed to satisfy the GFP_KERNEL allocation. This is
* the problem rescuer solves.
*
- * When such condition is possible, the gcwq summons rescuers of all
- * workqueues which have works queued on the gcwq and let them process
+ * When such condition is possible, the pool summons rescuers of all
+ * workqueues which have works queued on the pool and let them process
* those works so that forward progress can be guaranteed.
*
* This should happen rarely.
*/
-static int rescuer_thread(void *__wq)
+static int rescuer_thread(void *__rescuer)
{
- struct workqueue_struct *wq = __wq;
- struct worker *rescuer = wq->rescuer;
+ struct worker *rescuer = __rescuer;
+ struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
bool is_unbound = wq->flags & WQ_UNBOUND;
unsigned int cpu;
set_user_nice(current, RESCUER_NICE_LEVEL);
+
+ /*
+ * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
+ * doesn't participate in concurrency management.
+ */
+ rescuer->task->flags |= PF_WQ_WORKER;
repeat:
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
return 0;
}
@@ -2428,9 +2350,8 @@ repeat:
*/
for_each_mayday_cpu(cpu, wq->mayday_mask) {
unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
- struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
- struct worker_pool *pool = cwq->pool;
- struct global_cwq *gcwq = pool->gcwq;
+ struct pool_workqueue *pwq = get_pwq(tcpu, wq);
+ struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
__set_current_state(TASK_RUNNING);
@@ -2446,22 +2367,24 @@ repeat:
*/
BUG_ON(!list_empty(&rescuer->scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_cwq(work) == cwq)
+ if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
/*
- * Leave this gcwq. If keep_working() is %true, notify a
+ * Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
if (keep_working(pool))
wake_up_worker(pool);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
+ /* rescuers should never participate in concurrency management */
+ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
goto repeat;
}
@@ -2479,7 +2402,7 @@ static void wq_barrier_func(struct work_struct *work)
/**
* insert_wq_barrier - insert a barrier work
- * @cwq: cwq to insert barrier into
+ * @pwq: pwq to insert barrier into
* @barr: wq_barrier to insert
* @target: target work to attach @barr to
* @worker: worker currently executing @target, NULL if @target is not executing
@@ -2496,12 +2419,12 @@ static void wq_barrier_func(struct work_struct *work)
* after a work with LINKED flag set.
*
* Note that when @worker is non-NULL, @target may be modified
- * underneath us, so we can't reliably determine cwq from @target.
+ * underneath us, so we can't reliably determine pwq from @target.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
struct work_struct *target, struct worker *worker)
{
@@ -2509,7 +2432,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
unsigned int linked = 0;
/*
- * debugobject calls are safe here even with gcwq->lock locked
+ * debugobject calls are safe here even with pool->lock locked
* as we know for sure that this will not trigger any of the
* checks and call back into the fixup functions where we
* might deadlock.
@@ -2534,23 +2457,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
}
debug_work_activate(&barr->work);
- insert_work(cwq, &barr->work, head,
+ insert_work(pwq, &barr->work, head,
work_color_to_flags(WORK_NO_COLOR) | linked);
}
/**
- * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
* @wq: workqueue being flushed
* @flush_color: new flush color, < 0 for no-op
* @work_color: new work color, < 0 for no-op
*
- * Prepare cwqs for workqueue flushing.
+ * Prepare pwqs for workqueue flushing.
*
- * If @flush_color is non-negative, flush_color on all cwqs should be
- * -1. If no cwq has in-flight commands at the specified color, all
- * cwq->flush_color's stay at -1 and %false is returned. If any cwq
- * has in flight commands, its cwq->flush_color is set to
- * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * If @flush_color is non-negative, flush_color on all pwqs should be
+ * -1. If no pwq has in-flight commands at the specified color, all
+ * pwq->flush_color's stay at -1 and %false is returned. If any pwq
+ * has in flight commands, its pwq->flush_color is set to
+ * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
* wakeup logic is armed and %true is returned.
*
* The caller should have initialized @wq->first_flusher prior to
@@ -2558,7 +2481,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
* @flush_color is negative, no flush color update is done and %false
* is returned.
*
- * If @work_color is non-negative, all cwqs should have the same
+ * If @work_color is non-negative, all pwqs should have the same
* work_color which is previous to @work_color and all will be
* advanced to @work_color.
*
@@ -2569,42 +2492,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
* %true if @flush_color >= 0 and there's something to flush. %false
* otherwise.
*/
-static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
int flush_color, int work_color)
{
bool wait = false;
unsigned int cpu;
if (flush_color >= 0) {
- BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
- atomic_set(&wq->nr_cwqs_to_flush, 1);
+ BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
+ atomic_set(&wq->nr_pwqs_to_flush, 1);
}
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- struct global_cwq *gcwq = cwq->pool->gcwq;
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
- BUG_ON(cwq->flush_color != -1);
+ BUG_ON(pwq->flush_color != -1);
- if (cwq->nr_in_flight[flush_color]) {
- cwq->flush_color = flush_color;
- atomic_inc(&wq->nr_cwqs_to_flush);
+ if (pwq->nr_in_flight[flush_color]) {
+ pwq->flush_color = flush_color;
+ atomic_inc(&wq->nr_pwqs_to_flush);
wait = true;
}
}
if (work_color >= 0) {
- BUG_ON(work_color != work_next_color(cwq->work_color));
- cwq->work_color = work_color;
+ BUG_ON(work_color != work_next_color(pwq->work_color));
+ pwq->work_color = work_color;
}
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
- if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+ if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
complete(&wq->first_flusher->done);
return wait;
@@ -2655,7 +2578,7 @@ void flush_workqueue(struct workqueue_struct *wq)
wq->first_flusher = &this_flusher;
- if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+ if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
wq->work_color)) {
/* nothing to flush, done */
wq->flush_color = next_color;
@@ -2666,7 +2589,7 @@ void flush_workqueue(struct workqueue_struct *wq)
/* wait in queue */
BUG_ON(wq->flush_color == this_flusher.flush_color);
list_add_tail(&this_flusher.list, &wq->flusher_queue);
- flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
} else {
/*
@@ -2733,7 +2656,7 @@ void flush_workqueue(struct workqueue_struct *wq)
list_splice_tail_init(&wq->flusher_overflow,
&wq->flusher_queue);
- flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
if (list_empty(&wq->flusher_queue)) {
@@ -2743,7 +2666,7 @@ void flush_workqueue(struct workqueue_struct *wq)
/*
* Need to flush more colors. Make the next flusher
- * the new first flusher and arm cwqs.
+ * the new first flusher and arm pwqs.
*/
BUG_ON(wq->flush_color == wq->work_color);
BUG_ON(wq->flush_color != next->flush_color);
@@ -2751,7 +2674,7 @@ void flush_workqueue(struct workqueue_struct *wq)
list_del_init(&next->list);
wq->first_flusher = next;
- if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+ if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
break;
/*
@@ -2794,13 +2717,13 @@ void drain_workqueue(struct workqueue_struct *wq)
reflush:
flush_workqueue(wq);
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
bool drained;
- spin_lock_irq(&cwq->pool->gcwq->lock);
- drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
- spin_unlock_irq(&cwq->pool->gcwq->lock);
+ spin_lock_irq(&pwq->pool->lock);
+ drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
+ spin_unlock_irq(&pwq->pool->lock);
if (drained)
continue;
@@ -2822,34 +2745,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
{
struct worker *worker = NULL;
- struct global_cwq *gcwq;
- struct cpu_workqueue_struct *cwq;
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
might_sleep();
- gcwq = get_work_gcwq(work);
- if (!gcwq)
+ pool = get_work_pool(work);
+ if (!pool)
return false;
- spin_lock_irq(&gcwq->lock);
- if (!list_empty(&work->entry)) {
- /*
- * See the comment near try_to_grab_pending()->smp_rmb().
- * If it was re-queued to a different gcwq under us, we
- * are not going to wait.
- */
- smp_rmb();
- cwq = get_work_cwq(work);
- if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
+ spin_lock_irq(&pool->lock);
+ /* see the comment in try_to_grab_pending() with the same code */
+ pwq = get_work_pwq(work);
+ if (pwq) {
+ if (unlikely(pwq->pool != pool))
goto already_gone;
} else {
- worker = find_worker_executing_work(gcwq, work);
+ worker = find_worker_executing_work(pool, work);
if (!worker)
goto already_gone;
- cwq = worker->current_cwq;
+ pwq = worker->current_pwq;
}
- insert_wq_barrier(cwq, barr, work, worker);
- spin_unlock_irq(&gcwq->lock);
+ insert_wq_barrier(pwq, barr, work, worker);
+ spin_unlock_irq(&pool->lock);
/*
* If @max_active is 1 or rescuer is in use, flushing another work
@@ -2857,15 +2775,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
* flusher is not running on the same workqueue by verifying write
* access.
*/
- if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
- lock_map_acquire(&cwq->wq->lockdep_map);
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
+ lock_map_acquire(&pwq->wq->lockdep_map);
else
- lock_map_acquire_read(&cwq->wq->lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
return true;
already_gone:
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
return false;
}
@@ -2961,8 +2879,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
{
local_irq_disable();
if (del_timer_sync(&dwork->timer))
- __queue_work(dwork->cpu,
- get_work_cwq(&dwork->work)->wq, &dwork->work);
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
local_irq_enable();
return flush_work(&dwork->work);
}
@@ -2992,7 +2909,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)
if (unlikely(ret < 0))
return false;
- set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
+ set_work_pool_and_clear_pending(&dwork->work,
+ get_work_pool_id(&dwork->work));
local_irq_restore(flags);
return ret;
}
@@ -3171,46 +3089,46 @@ int keventd_up(void)
return system_wq != NULL;
}
-static int alloc_cwqs(struct workqueue_struct *wq)
+static int alloc_pwqs(struct workqueue_struct *wq)
{
/*
- * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
+ * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
* Make sure that the alignment isn't lower than that of
* unsigned long long.
*/
- const size_t size = sizeof(struct cpu_workqueue_struct);
+ const size_t size = sizeof(struct pool_workqueue);
const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
__alignof__(unsigned long long));
if (!(wq->flags & WQ_UNBOUND))
- wq->cpu_wq.pcpu = __alloc_percpu(size, align);
+ wq->pool_wq.pcpu = __alloc_percpu(size, align);
else {
void *ptr;
/*
- * Allocate enough room to align cwq and put an extra
+ * Allocate enough room to align pwq and put an extra
* pointer at the end pointing back to the originally
* allocated pointer which will be used for free.
*/
ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
if (ptr) {
- wq->cpu_wq.single = PTR_ALIGN(ptr, align);
- *(void **)(wq->cpu_wq.single + 1) = ptr;
+ wq->pool_wq.single = PTR_ALIGN(ptr, align);
+ *(void **)(wq->pool_wq.single + 1) = ptr;
}
}
/* just in case, make sure it's actually aligned */
- BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
- return wq->cpu_wq.v ? 0 : -ENOMEM;
+ BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
+ return wq->pool_wq.v ? 0 : -ENOMEM;
}
-static void free_cwqs(struct workqueue_struct *wq)
+static void free_pwqs(struct workqueue_struct *wq)
{
if (!(wq->flags & WQ_UNBOUND))
- free_percpu(wq->cpu_wq.pcpu);
- else if (wq->cpu_wq.single) {
- /* the pointer to free is stored right after the cwq */
- kfree(*(void **)(wq->cpu_wq.single + 1));
+ free_percpu(wq->pool_wq.pcpu);
+ else if (wq->pool_wq.single) {
+ /* the pointer to free is stored right after the pwq */
+ kfree(*(void **)(wq->pool_wq.single + 1));
}
}
@@ -3264,27 +3182,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
wq->flags = flags;
wq->saved_max_active = max_active;
mutex_init(&wq->flush_mutex);
- atomic_set(&wq->nr_cwqs_to_flush, 0);
+ atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
INIT_LIST_HEAD(&wq->list);
- if (alloc_cwqs(wq) < 0)
+ if (alloc_pwqs(wq) < 0)
goto err;
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- struct global_cwq *gcwq = get_gcwq(cpu);
- int pool_idx = (bool)(flags & WQ_HIGHPRI);
-
- BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
- cwq->pool = &gcwq->pools[pool_idx];
- cwq->wq = wq;
- cwq->flush_color = -1;
- cwq->max_active = max_active;
- INIT_LIST_HEAD(&cwq->delayed_works);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
+
+ BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
+ pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
+ pwq->wq = wq;
+ pwq->flush_color = -1;
+ pwq->max_active = max_active;
+ INIT_LIST_HEAD(&pwq->delayed_works);
}
if (flags & WQ_RESCUER) {
@@ -3297,7 +3213,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (!rescuer)
goto err;
- rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+ rescuer->rescue_wq = wq;
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
wq->name);
if (IS_ERR(rescuer->task))
goto err;
@@ -3314,8 +3231,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
spin_lock(&workqueue_lock);
if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
- for_each_cwq_cpu(cpu, wq)
- get_cwq(cpu, wq)->max_active = 0;
+ for_each_pwq_cpu(cpu, wq)
+ get_pwq(cpu, wq)->max_active = 0;
list_add(&wq->list, &workqueues);
@@ -3324,7 +3241,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
return wq;
err:
if (wq) {
- free_cwqs(wq);
+ free_pwqs(wq);
free_mayday_mask(wq->mayday_mask);
kfree(wq->rescuer);
kfree(wq);
@@ -3355,14 +3272,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
spin_unlock(&workqueue_lock);
/* sanity check */
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
int i;
for (i = 0; i < WORK_NR_COLORS; i++)
- BUG_ON(cwq->nr_in_flight[i]);
- BUG_ON(cwq->nr_active);
- BUG_ON(!list_empty(&cwq->delayed_works));
+ BUG_ON(pwq->nr_in_flight[i]);
+ BUG_ON(pwq->nr_active);
+ BUG_ON(!list_empty(&pwq->delayed_works));
}
if (wq->flags & WQ_RESCUER) {
@@ -3371,29 +3288,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
kfree(wq->rescuer);
}
- free_cwqs(wq);
+ free_pwqs(wq);
kfree(wq);
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
/**
- * cwq_set_max_active - adjust max_active of a cwq
- * @cwq: target cpu_workqueue_struct
+ * pwq_set_max_active - adjust max_active of a pwq
+ * @pwq: target pool_workqueue
* @max_active: new max_active value.
*
- * Set @cwq->max_active to @max_active and activate delayed works if
+ * Set @pwq->max_active to @max_active and activate delayed works if
* increased.
*
* CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
*/
-static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
{
- cwq->max_active = max_active;
+ pwq->max_active = max_active;
- while (!list_empty(&cwq->delayed_works) &&
- cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
+ while (!list_empty(&pwq->delayed_works) &&
+ pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
}
/**
@@ -3416,16 +3333,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
wq->saved_max_active = max_active;
- for_each_cwq_cpu(cpu, wq) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_pwq_cpu(cpu, wq) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
if (!(wq->flags & WQ_FREEZABLE) ||
- !(gcwq->flags & GCWQ_FREEZING))
- cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
+ !(pool->flags & POOL_FREEZING))
+ pwq_set_max_active(pwq, max_active);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
spin_unlock(&workqueue_lock);
@@ -3446,57 +3364,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
*/
bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
{
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- return !list_empty(&cwq->delayed_works);
+ return !list_empty(&pwq->delayed_works);
}
EXPORT_SYMBOL_GPL(workqueue_congested);
/**
- * work_cpu - return the last known associated cpu for @work
- * @work: the work of interest
- *
- * RETURNS:
- * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
- */
-unsigned int work_cpu(struct work_struct *work)
-{
- struct global_cwq *gcwq = get_work_gcwq(work);
-
- return gcwq ? gcwq->cpu : WORK_CPU_NONE;
-}
-EXPORT_SYMBOL_GPL(work_cpu);
-
-/**
* work_busy - test whether a work is currently pending or running
* @work: the work to be tested
*
* Test whether @work is currently pending or running. There is no
* synchronization around this function and the test result is
* unreliable and only useful as advisory hints or for debugging.
- * Especially for reentrant wqs, the pending state might hide the
- * running state.
*
* RETURNS:
* OR'd bitmask of WORK_BUSY_* bits.
*/
unsigned int work_busy(struct work_struct *work)
{
- struct global_cwq *gcwq = get_work_gcwq(work);
+ struct worker_pool *pool = get_work_pool(work);
unsigned long flags;
unsigned int ret = 0;
- if (!gcwq)
- return 0;
-
- spin_lock_irqsave(&gcwq->lock, flags);
-
if (work_pending(work))
ret |= WORK_BUSY_PENDING;
- if (find_worker_executing_work(gcwq, work))
- ret |= WORK_BUSY_RUNNING;
- spin_unlock_irqrestore(&gcwq->lock, flags);
+ if (pool) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (find_worker_executing_work(pool, work))
+ ret |= WORK_BUSY_RUNNING;
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
return ret;
}
@@ -3506,65 +3405,48 @@ EXPORT_SYMBOL_GPL(work_busy);
* CPU hotplug.
*
* There are two challenges in supporting CPU hotplug. Firstly, there
- * are a lot of assumptions on strong associations among work, cwq and
- * gcwq which make migrating pending and scheduled works very
+ * are a lot of assumptions on strong associations among work, pwq and
+ * pool which make migrating pending and scheduled works very
* difficult to implement without impacting hot paths. Secondly,
- * gcwqs serve mix of short, long and very long running works making
+ * worker pools serve mix of short, long and very long running works making
* blocked draining impractical.
*
- * This is solved by allowing a gcwq to be disassociated from the CPU
+ * This is solved by allowing the pools to be disassociated from the CPU
* running as an unbound one and allowing it to be reattached later if the
* cpu comes back online.
*/
-/* claim manager positions of all pools */
-static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
+static void wq_unbind_fn(struct work_struct *work)
{
- struct worker_pool *pool;
-
- for_each_worker_pool(pool, gcwq)
- mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
- spin_lock_irq(&gcwq->lock);
-}
-
-/* release manager positions */
-static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
-{
- struct worker_pool *pool;
-
- spin_unlock_irq(&gcwq->lock);
- for_each_worker_pool(pool, gcwq)
- mutex_unlock(&pool->assoc_mutex);
-}
-
-static void gcwq_unbind_fn(struct work_struct *work)
-{
- struct global_cwq *gcwq = get_gcwq(smp_processor_id());
+ int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
- struct hlist_node *pos;
int i;
- BUG_ON(gcwq->cpu != smp_processor_id());
+ for_each_std_worker_pool(pool, cpu) {
+ BUG_ON(cpu != smp_processor_id());
- gcwq_claim_assoc_and_lock(gcwq);
+ mutex_lock(&pool->assoc_mutex);
+ spin_lock_irq(&pool->lock);
- /*
- * We've claimed all manager positions. Make all workers unbound
- * and set DISASSOCIATED. Before this, all workers except for the
- * ones which are still executing works from before the last CPU
- * down must be on the cpu. After this, they may become diasporas.
- */
- for_each_worker_pool(pool, gcwq)
+ /*
+ * We've claimed all manager positions. Make all workers
+ * unbound and set DISASSOCIATED. Before this, all workers
+ * except for the ones which are still executing works from
+ * before the last CPU down must be on the cpu. After
+ * this, they may become diasporas.
+ */
list_for_each_entry(worker, &pool->idle_list, entry)
worker->flags |= WORKER_UNBOUND;
- for_each_busy_worker(worker, i, pos, gcwq)
- worker->flags |= WORKER_UNBOUND;
+ for_each_busy_worker(worker, i, pool)
+ worker->flags |= WORKER_UNBOUND;
- gcwq->flags |= GCWQ_DISASSOCIATED;
+ pool->flags |= POOL_DISASSOCIATED;
- gcwq_release_assoc_and_unlock(gcwq);
+ spin_unlock_irq(&pool->lock);
+ mutex_unlock(&pool->assoc_mutex);
+ }
/*
* Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3576,16 +3458,16 @@ static void gcwq_unbind_fn(struct work_struct *work)
/*
* Sched callbacks are disabled now. Zap nr_running. After this,
* nr_running stays zero and need_more_worker() and keep_working()
- * are always true as long as the worklist is not empty. @gcwq now
- * behaves as unbound (in terms of concurrency management) gcwq
- * which is served by workers tied to the CPU.
+ * are always true as long as the worklist is not empty. Pools on
+ * @cpu now behave as unbound (in terms of concurrency management)
+ * pools which are served by workers tied to the CPU.
*
* On return from this function, the current worker would trigger
* unbound chain execution of pending work items if other workers
* didn't already.
*/
- for_each_worker_pool(pool, gcwq)
- atomic_set(get_pool_nr_running(pool), 0);
+ for_each_std_worker_pool(pool, cpu)
+ atomic_set(&pool->nr_running, 0);
}
/*
@@ -3597,12 +3479,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct global_cwq *gcwq = get_gcwq(cpu);
struct worker_pool *pool;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- for_each_worker_pool(pool, gcwq) {
+ for_each_std_worker_pool(pool, cpu) {
struct worker *worker;
if (pool->nr_workers)
@@ -3612,18 +3493,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
if (!worker)
return NOTIFY_BAD;
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
start_worker(worker);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
break;
case CPU_DOWN_FAILED:
case CPU_ONLINE:
- gcwq_claim_assoc_and_lock(gcwq);
- gcwq->flags &= ~GCWQ_DISASSOCIATED;
- rebind_workers(gcwq);
- gcwq_release_assoc_and_unlock(gcwq);
+ for_each_std_worker_pool(pool, cpu) {
+ mutex_lock(&pool->assoc_mutex);
+ spin_lock_irq(&pool->lock);
+
+ pool->flags &= ~POOL_DISASSOCIATED;
+ rebind_workers(pool);
+
+ spin_unlock_irq(&pool->lock);
+ mutex_unlock(&pool->assoc_mutex);
+ }
break;
}
return NOTIFY_OK;
@@ -3643,7 +3530,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
/* unbinding should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
queue_work_on(cpu, system_highpri_wq, &unbind_work);
flush_work(&unbind_work);
break;
@@ -3696,10 +3583,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
*
* Start freezing workqueues. After this function returns, all freezable
* workqueues will queue new works to their frozen_works list instead of
- * gcwq->worklist.
+ * pool->worklist.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases workqueue_lock and pool->lock's.
*/
void freeze_workqueues_begin(void)
{
@@ -3710,23 +3597,26 @@ void freeze_workqueues_begin(void)
BUG_ON(workqueue_freezing);
workqueue_freezing = true;
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_wq_cpu(cpu) {
+ struct worker_pool *pool;
struct workqueue_struct *wq;
- spin_lock_irq(&gcwq->lock);
+ for_each_std_worker_pool(pool, cpu) {
+ spin_lock_irq(&pool->lock);
- BUG_ON(gcwq->flags & GCWQ_FREEZING);
- gcwq->flags |= GCWQ_FREEZING;
+ WARN_ON_ONCE(pool->flags & POOL_FREEZING);
+ pool->flags |= POOL_FREEZING;
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ list_for_each_entry(wq, &workqueues, list) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- if (cwq && wq->flags & WQ_FREEZABLE)
- cwq->max_active = 0;
- }
+ if (pwq && pwq->pool == pool &&
+ (wq->flags & WQ_FREEZABLE))
+ pwq->max_active = 0;
+ }
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
+ }
}
spin_unlock(&workqueue_lock);
@@ -3754,20 +3644,20 @@ bool freeze_workqueues_busy(void)
BUG_ON(!workqueue_freezing);
- for_each_gcwq_cpu(cpu) {
+ for_each_wq_cpu(cpu) {
struct workqueue_struct *wq;
/*
* nr_active is monotonically decreasing. It's safe
* to peek without lock.
*/
list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZABLE))
+ if (!pwq || !(wq->flags & WQ_FREEZABLE))
continue;
- BUG_ON(cwq->nr_active < 0);
- if (cwq->nr_active) {
+ BUG_ON(pwq->nr_active < 0);
+ if (pwq->nr_active) {
busy = true;
goto out_unlock;
}
@@ -3782,10 +3672,10 @@ out_unlock:
* thaw_workqueues - thaw workqueues
*
* Thaw workqueues. Normal queueing is restored and all collected
- * frozen works are transferred to their respective gcwq worklists.
+ * frozen works are transferred to their respective pool worklists.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases workqueue_lock and pool->lock's.
*/
void thaw_workqueues(void)
{
@@ -3796,30 +3686,31 @@ void thaw_workqueues(void)
if (!workqueue_freezing)
goto out_unlock;
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_wq_cpu(cpu) {
struct worker_pool *pool;
struct workqueue_struct *wq;
- spin_lock_irq(&gcwq->lock);
+ for_each_std_worker_pool(pool, cpu) {
+ spin_lock_irq(&pool->lock);
- BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
- gcwq->flags &= ~GCWQ_FREEZING;
+ WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
+ pool->flags &= ~POOL_FREEZING;
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ list_for_each_entry(wq, &workqueues, list) {
+ struct pool_workqueue *pwq = get_pwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZABLE))
- continue;
+ if (!pwq || pwq->pool != pool ||
+ !(wq->flags & WQ_FREEZABLE))
+ continue;
- /* restore max_active and repopulate worklist */
- cwq_set_max_active(cwq, wq->saved_max_active);
- }
+ /* restore max_active and repopulate worklist */
+ pwq_set_max_active(pwq, wq->saved_max_active);
+ }
- for_each_worker_pool(pool, gcwq)
wake_up_worker(pool);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
+ }
}
workqueue_freezing = false;
@@ -3831,60 +3722,56 @@ out_unlock:
static int __init init_workqueues(void)
{
unsigned int cpu;
- int i;
- /* make sure we have enough bits for OFFQ CPU number */
- BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
- WORK_CPU_LAST);
+ /* make sure we have enough bits for OFFQ pool ID */
+ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
+ WORK_CPU_END * NR_STD_WORKER_POOLS);
cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
- /* initialize gcwqs */
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ /* initialize CPU pools */
+ for_each_wq_cpu(cpu) {
struct worker_pool *pool;
- spin_lock_init(&gcwq->lock);
- gcwq->cpu = cpu;
- gcwq->flags |= GCWQ_DISASSOCIATED;
-
- for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
-
- for_each_worker_pool(pool, gcwq) {
- pool->gcwq = gcwq;
+ for_each_std_worker_pool(pool, cpu) {
+ spin_lock_init(&pool->lock);
+ pool->cpu = cpu;
+ pool->flags |= POOL_DISASSOCIATED;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
+ hash_init(pool->busy_hash);
init_timer_deferrable(&pool->idle_timer);
pool->idle_timer.function = idle_worker_timeout;
pool->idle_timer.data = (unsigned long)pool;
- setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
+ setup_timer(&pool->mayday_timer, pool_mayday_timeout,
(unsigned long)pool);
mutex_init(&pool->assoc_mutex);
ida_init(&pool->worker_ida);
+
+ /* alloc pool ID */
+ BUG_ON(worker_pool_assign_id(pool));
}
}
/* create the initial worker */
- for_each_online_gcwq_cpu(cpu) {
- struct global_cwq *gcwq = get_gcwq(cpu);
+ for_each_online_wq_cpu(cpu) {
struct worker_pool *pool;
- if (cpu != WORK_CPU_UNBOUND)
- gcwq->flags &= ~GCWQ_DISASSOCIATED;
-
- for_each_worker_pool(pool, gcwq) {
+ for_each_std_worker_pool(pool, cpu) {
struct worker *worker;
+ if (cpu != WORK_CPU_UNBOUND)
+ pool->flags &= ~POOL_DISASSOCIATED;
+
worker = create_worker(pool);
BUG_ON(!worker);
- spin_lock_irq(&gcwq->lock);
+ spin_lock_irq(&pool->lock);
start_worker(worker);
- spin_unlock_irq(&gcwq->lock);
+ spin_unlock_irq(&pool->lock);
}
}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000000..07650264ec15
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
+/*
+ * kernel/workqueue_internal.h
+ *
+ * Workqueue internal header file. Only to be included by workqueue and
+ * core kernel subsystems.
+ */
+#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
+#define _KERNEL_WORKQUEUE_INTERNAL_H
+
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+
+struct worker_pool;
+
+/*
+ * The poor guys doing the actual heavy lifting. All on-duty workers are
+ * either serving the manager role, on idle list or on busy hash. For
+ * details on the locking annotation (L, I, X...), refer to workqueue.c.
+ *
+ * Only to be used in workqueue and async.
+ */
+struct worker {
+ /* on idle list while idle, on busy hash table while busy */
+ union {
+ struct list_head entry; /* L: while idle */
+ struct hlist_node hentry; /* L: while busy */
+ };
+
+ struct work_struct *current_work; /* L: work being processed */
+ work_func_t current_func; /* L: current_work's fn */
+ struct pool_workqueue *current_pwq; /* L: current_work's pwq */
+ struct list_head scheduled; /* L: scheduled works */
+ struct task_struct *task; /* I: worker task */
+ struct worker_pool *pool; /* I: the associated pool */
+ /* 64 bytes boundary on 64bit, 32 on 32bit */
+ unsigned long last_active; /* L: last active timestamp */
+ unsigned int flags; /* X: flags */
+ int id; /* I: worker id */
+
+ /* for rebinding worker to CPU */
+ struct work_struct rebind_work; /* L: for busy worker */
+
+ /* used only by rescuers to point to the target workqueue */
+ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
+};
+
+/**
+ * current_wq_worker - return struct worker if %current is a workqueue worker
+ */
+static inline struct worker *current_wq_worker(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return kthread_data(current);
+ return NULL;
+}
+
+/*
+ * Scheduler hooks for concurrency managed workqueue. Only to be used from
+ * sched.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+ unsigned int cpu);
+
+#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc79..000000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * kernel/workqueue_sched.h
- *
- * Scheduler hooks for concurrency managed workqueue. Only to be
- * included from sched.c and workqueue.c.
- */
-void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task,
- unsigned int cpu);