diff options
Diffstat (limited to 'kernel/rcutree_plugin.h')
-rw-r--r-- | kernel/rcutree_plugin.h | 568 |
1 files changed, 447 insertions, 121 deletions
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a3638710dc67..3f6559a5f5cd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1,7 +1,7 @@ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO - "\tRCU-based detection of stalled CPUs is disabled.\n"); -#endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif @@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void) struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static struct rcu_state *rcu_state = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); + printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Record a preemptable-RCU quiescent state for the specified CPU. Note + * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. @@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu) * We have entered the scheduler, and the current task might soon be * context-switched away from. If this task is in an RCU read-side * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the appropriate entry - * of the blocked_tasks[] array. The task will dequeue itself when - * it exits the outermost enclosing RCU read-side critical section. - * Therefore, the current grace period cannot be permitted to complete - * until the blocked_tasks[] entry indexed by the low-order bit of - * rnp->gpnum empties. + * record that fact, so we enqueue the task on the blkd_tasks list. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the blkd_tasks list entries + * predating the current grace period drain, in other words, until + * rnp->gp_tasks becomes NULL. * * Caller must disable preemption. */ @@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; - int phase; struct rcu_data *rdp; struct rcu_node *rnp; @@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu) * (i.e., this CPU has not yet passed through a quiescent * state for the current grace period), then as long * as that task remains queued, the current grace period - * cannot end. + * cannot end. Note that there is some uncertainty as + * to exactly when the current grace period started. + * We take a conservative approach, which can result + * in unnecessarily waiting on tasks that started very + * slightly after the current grace period began. C'est + * la vie!!! * * But first, note that the current CPU must still be * on line! */ WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; - list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); + if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { + list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); + rnp->gp_tasks = &t->rcu_node_entry; +#ifdef CONFIG_RCU_BOOST + if (rnp->boost_tasks != NULL) + rnp->boost_tasks = rnp->gp_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + } else { + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + if (rnp->qsmask & rdp->grpmask) + rnp->gp_tasks = &t->rcu_node_entry; + } raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Tree-preemptable RCU implementation for rcu_read_lock(). + * Tree-preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated * if we block. */ @@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); * for the specified rcu_node structure. If the caller needs a reliable * answer, it must hold the rcu_node's ->lock. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - int phase = rnp->gpnum & 0x1; - - return !list_empty(&rnp->blocked_tasks[phase]) || - !list_empty(&rnp->blocked_tasks[phase + 2]); + return rnp->gp_tasks != NULL; } /* @@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) unsigned long mask; struct rcu_node *rnp_p; - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) } /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t, + struct rcu_node *rnp) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rnp->blkd_tasks) + np = NULL; + return np; +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. @@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t) int empty; int empty_exp; unsigned long flags; + struct list_head *np; struct rcu_node *rnp; int special; @@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempted_readers(rnp); + empty = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ + np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp->gp_tasks = np; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp->exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; /* @@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t) else rcu_report_unblock_qs_rnp(rnp, flags); +#ifdef CONFIG_RCU_BOOST + /* Unboost if we were boosted. */ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. @@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* - * Tree-preemptable RCU implementation for rcu_read_unlock(). + * Tree-preemptible RCU implementation for rcu_read_unlock(). * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then * invoke rcu_read_unlock_special() to clean up after a context switch @@ -356,8 +398,6 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* @@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) { unsigned long flags; - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - raw_spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; - list_for_each_entry(t, lp, rcu_node_entry) - sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return; + raw_spin_lock_irqsave(&rnp->lock, flags); + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) */ static void rcu_print_task_stall(struct rcu_node *rnp) { - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; - list_for_each_entry(t, lp, rcu_node_entry) - printk(" P%d", t->pid); - } + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return; + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + printk(" P%d", t->pid); } /* @@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void) rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock * must be held by the caller. + * + * Also, if there are blocked tasks on the list, they automatically + * block the newly created grace period, so set up ->gp_tasks accordingly. */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(rcu_preempted_readers(rnp)); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (!list_empty(&rnp->blkd_tasks)) + rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; struct list_head *lp; struct list_head *lp_root; int retval = 0; struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *tp; + struct task_struct *t; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); return 0; /* Shouldn't happen: at least one CPU online. */ } - WARN_ON_ONCE(rnp != rdp->mynode && - (!list_empty(&rnp->blocked_tasks[0]) || - !list_empty(&rnp->blocked_tasks[1]) || - !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]))); + + /* If we are on an internal node, complain bitterly. */ + WARN_ON_ONCE(rnp != rdp->mynode); /* - * Move tasks up to root rcu_node. Rely on the fact that the - * root rcu_node can be at most one ahead of the rest of the - * rcu_nodes in terms of gp_num value. This fact allows us to - * move the blocked_tasks[] array directly, element by element. + * Move tasks up to root rcu_node. Don't try to get fancy for + * this corner-case operation -- just put this node's tasks + * at the head of the root node's list, and update the root node's + * ->gp_tasks and ->exp_tasks pointers to those of this node's, + * if non-NULL. This might result in waiting for more tasks than + * absolutely necessary, but this is a good performance/complexity + * tradeoff. */ - if (rcu_preempted_readers(rnp)) + if (rcu_preempt_blocked_readers_cgp(rnp)) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; - for (i = 0; i < 4; i++) { - lp = &rnp->blocked_tasks[i]; - lp_root = &rnp_root->blocked_tasks[i]; - while (!list_empty(lp)) { - tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - list_del(&tp->rcu_node_entry); - tp->rcu_blocked_node = rnp_root; - list_add(&tp->rcu_node_entry, lp_root); - raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ - } + lp = &rnp->blkd_tasks; + lp_root = &rnp_root->blkd_tasks; + while (!list_empty(lp)) { + t = list_entry(lp->next, typeof(*t), rcu_node_entry); + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&t->rcu_node_entry); + t->rcu_blocked_node = rnp_root; + list_add(&t->rcu_node_entry, lp_root); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp_root->gp_tasks = rnp->gp_tasks; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp_root->exp_tasks = rnp->exp_tasks; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp_root->boost_tasks = rnp->boost_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + +#ifdef CONFIG_RCU_BOOST + /* In case root is being boosted and leaf is not. */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + if (rnp_root->boost_tasks != NULL && + rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks = rnp_root->gp_tasks; + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; return retval; } /* - * Do CPU-offline processing for preemptable RCU. + * Do CPU-offline processing for preemptible RCU. */ static void rcu_preempt_offline_cpu(int cpu) { @@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Process callbacks for preemptable RCU. + * Process callbacks for preemptible RCU. */ static void rcu_preempt_process_callbacks(void) { @@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void) } /* - * Queue a preemptable-RCU callback for invocation after a grace period. + * Queue a preemptible-RCU callback for invocation after a grace period. */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { @@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); */ static int rcu_preempted_readers_exp(struct rcu_node *rnp) { - return !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]); + return rnp->exp_tasks != NULL; } /* @@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { - int must_wait; + unsigned long flags; + int must_wait = 0; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); - list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); - must_wait = rcu_preempted_readers_exp(rnp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (list_empty(&rnp->blkd_tasks)) + raw_spin_unlock_irqrestore(&rnp->lock, flags); + else { + rnp->exp_tasks = rnp->blkd_tasks.next; + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + must_wait = 1; + } if (!must_wait) rcu_report_exp_rnp(rsp, rnp); } @@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) /* * Wait for an rcu-preempt grace period, but expedite it. The basic idea * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blocked_tasks[] lists, move all entries from the first set of - * ->blocked_tasks[] lists to the second set, and finally wait for this - * second set to drain. + * the ->blkd_tasks lists and wait for this list to drain. */ void synchronize_rcu_expedited(void) { @@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void) if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) goto unlock_mb_ret; /* Others did our work for us. */ - /* force all RCU readers onto blocked_tasks[]. */ + /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - /* Snapshot current state of ->blocked_tasks[] lists. */ + /* Snapshot current state of ->blkd_tasks lists. */ rcu_for_each_leaf_node(rsp, rnp) sync_rcu_preempt_exp_init(rsp, rnp); if (NUM_RCU_NODES > 1) @@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ + /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); wait_event(sync_rcu_preempt_exp_wq, sync_rcu_preempt_exp_done(rnp)); @@ -739,7 +797,7 @@ mb_ret: EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); /* - * Check to see if there is any immediate preemptable-RCU-related work + * Check to see if there is any immediate preemptible-RCU-related work * to be done. */ static int rcu_preempt_pending(int cpu) @@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Does preemptable RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU need the CPU to stay out of dynticks mode? */ static int rcu_preempt_needs_cpu(int cpu) { @@ -766,7 +824,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Initialize preemptable RCU's per-CPU data. + * Initialize preemptible RCU's per-CPU data. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { @@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU. */ static void rcu_preempt_send_cbs_to_online(void) { @@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void) } /* - * Initialize preemptable RCU's state structures. + * Initialize preemptible RCU's state structures. */ static void __init __rcu_init_preempt(void) { @@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void) } /* - * Check for a task exiting while in a preemptable-RCU read-side + * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep * is enabled. @@ -802,11 +860,13 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +static struct rcu_state *rcu_state = &rcu_sched_state; + /* * Tell them what RCU they are running. */ @@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ static void rcu_preempt_note_context_switch(int cpu) @@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Because preemptable RCU does not exist, there are never any preempted + * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { return 0; } @@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_detail_task_stall(struct rcu_state *rsp) @@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) } /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_task_stall(struct rcu_node *rnp) @@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void) { } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* - * Because there is no preemptable RCU, there can be no readers blocked, + * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ @@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, it never needs to migrate + * Because preemptible RCU does not exist, it never needs to migrate * tasks that were blocked within RCU read-side critical sections, and * such non-existent tasks cannot possibly have been blocking the current * grace period. @@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, } /* - * Because preemptable RCU does not exist, it never needs CPU-offline + * Because preemptible RCU does not exist, it never needs CPU-offline * processing. */ static void rcu_preempt_offline_cpu(int cpu) @@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to check. */ static void rcu_preempt_check_callbacks(int cpu) @@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to process. */ static void rcu_preempt_process_callbacks(void) @@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void) /* * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptable RCU does not exist, map to rcu-sched. + * But because preemptible RCU does not exist, map to rcu-sched. */ void synchronize_rcu_expedited(void) { @@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, there is never any need to + * Because preemptible RCU does not exist, there is never any need to * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ @@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any work to do. + * Because preemptible RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) { @@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Because preemptable RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never needs any CPU. */ static int rcu_preempt_needs_cpu(int cpu) { @@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu) } /* - * Because preemptable RCU does not exist, rcu_barrier() is just + * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ void rcu_barrier(void) @@ -992,7 +1048,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Because preemptable RCU does not exist, there is no per-CPU + * Because preemptible RCU does not exist, there is no per-CPU * data to initialize. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) @@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Because there is no preemptable RCU, there are no callbacks to move. + * Because there is no preemptible RCU, there are no callbacks to move. */ static void rcu_preempt_send_cbs_to_online(void) { } /* - * Because preemptable RCU does not exist, it need not be initialized. + * Because preemptible RCU does not exist, it need not be initialized. */ static void __init __rcu_init_preempt(void) { @@ -1015,6 +1071,276 @@ static void __init __rcu_init_preempt(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +#ifdef CONFIG_RCU_TRACE + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ + if (list_empty(&rnp->blkd_tasks)) + rnp->n_balk_blkd_tasks++; + else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) + rnp->n_balk_exp_gp_tasks++; + else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) + rnp->n_balk_boost_tasks++; + else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) + rnp->n_balk_notblocked++; + else if (rnp->gp_tasks != NULL && + ULONG_CMP_LT(jiffies, rnp->boost_time)) + rnp->n_balk_notyet++; + else + rnp->n_balk_nos++; +} + +#else /* #ifdef CONFIG_RCU_TRACE */ + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * Carry out RCU priority boosting on the task indicated by ->exp_tasks + * or ->boost_tasks, advancing the pointer to the next task in the + * ->blkd_tasks list. + * + * Note that irqs must be enabled: boosting the task can block. + * Returns 1 if there are more tasks needing to be boosted. + */ +static int rcu_boost(struct rcu_node *rnp) +{ + unsigned long flags; + struct rt_mutex mtx; + struct task_struct *t; + struct list_head *tb; + + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + return 0; /* Nothing left to boost. */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + + /* + * Recheck under the lock: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own. + */ + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rnp->exp_tasks != NULL) { + tb = rnp->exp_tasks; + rnp->n_exp_boosts++; + } else { + tb = rnp->boost_tasks; + rnp->n_normal_boosts++; + } + rnp->n_tasks_boosted++; + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + * + * Note that task t must acquire rnp->lock to remove itself from + * the ->blkd_tasks list, which it will do from exit() if from + * nowhere else. We therefore are guaranteed that task t will + * stay around at least until we drop rnp->lock. Note that + * rnp->lock also resolves races between our priority boosting + * and task t's exiting its outermost RCU read-side critical + * section. + */ + t = container_of(tb, struct task_struct, rcu_node_entry); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; +} + +/* + * Timer handler to initiate waking up of boost kthreads that + * have yielded the CPU due to excessive numbers of tasks to + * boost. We wake up the per-rcu_node kthread, which in turn + * will wake up the booster kthread. + */ +static void rcu_boost_kthread_timer(unsigned long arg) +{ + invoke_rcu_node_kthread((struct rcu_node *)arg); +} + +/* + * Priority-boosting kthread. One per leaf rcu_node and one for the + * root rcu_node. + */ +static int rcu_boost_kthread(void *arg) +{ + struct rcu_node *rnp = (struct rcu_node *)arg; + int spincnt = 0; + int more2boost; + + for (;;) { + rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || + rnp->exp_tasks); + rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + more2boost = rcu_boost(rnp); + if (more2boost) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + spincnt = 0; + } + } + /* NOTREACHED */ + return 0; +} + +/* + * Check to see if it is time to start boosting RCU readers that are + * blocking the current grace period, and, if so, tell the per-rcu_node + * kthread to start boosting them. If there is an expedited grace + * period in progress, it is always time to boost. + * + * The caller must hold rnp->lock, which this function releases, + * but irqs remain disabled. The ->boost_kthread_task is immortal, + * so we don't need to worry about it going away. + */ +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + struct task_struct *t; + + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + rnp->n_balk_exp_gp_tasks++; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rnp->exp_tasks != NULL || + (rnp->gp_tasks != NULL && + rnp->boost_tasks == NULL && + rnp->qsmask == 0 && + ULONG_CMP_GE(jiffies, rnp->boost_time))) { + if (rnp->exp_tasks == NULL) + rnp->boost_tasks = rnp->gp_tasks; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + t = rnp->boost_kthread_task; + if (t != NULL) + wake_up_process(t); + } else { + rcu_initiate_boost_trace(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Set the affinity of the boost kthread. The CPU-hotplug locks are + * held, so no one should be messing with the existence of the boost + * kthread. + */ +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ + struct task_struct *t; + + t = rnp->boost_kthread_task; + if (t != NULL) + set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ + rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; +} + +/* + * Initialize the RCU-boost waitqueue. + */ +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) +{ + init_waitqueue_head(&rnp->boost_wq); +} + +/* + * Create an RCU-boost kthread for the specified node if one does not + * already exist. We only create this kthread for preemptible RCU. + * Returns zero if all is well, a negated errno otherwise. + */ +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + unsigned long flags; + struct sched_param sp; + struct task_struct *t; + + if (&rcu_preempt_state != rsp) + return 0; + if (rnp->boost_kthread_task != NULL) + return 0; + t = kthread_create(rcu_boost_kthread, (void *)rnp, + "rcub%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ +} + +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ +} + +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) +{ +} + +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + return 0; +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + #ifndef CONFIG_SMP void synchronize_sched_expedited(void) @@ -1187,8 +1513,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * raise_softirq() to cause rcu_process_callbacks() to be invoked later. - * The per-cpu rcu_dyntick_drain variable controls the sequencing. + * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked + * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) { @@ -1239,7 +1565,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cpu_kthread(); return c; } |