diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cgroup/rstat.c | 13 | ||||
| -rw-r--r-- | kernel/cpu.c | 25 | ||||
| -rw-r--r-- | kernel/dma/pool.c | 2 | ||||
| -rw-r--r-- | kernel/events/core.c | 22 | ||||
| -rw-r--r-- | kernel/events/uprobes.c | 8 | ||||
| -rw-r--r-- | kernel/irq/irqdesc.c | 6 | ||||
| -rw-r--r-- | kernel/irq/manage.c | 3 | ||||
| -rw-r--r-- | kernel/liveupdate/Kconfig | 1 | ||||
| -rw-r--r-- | kernel/liveupdate/luo_core.c | 4 | ||||
| -rw-r--r-- | kernel/liveupdate/luo_file.c | 7 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 72 |
11 files changed, 102 insertions, 61 deletions
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a198e40c799b..150e5871e66f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -71,7 +71,6 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; - struct css_rstat_cpu __percpu *rstatc_pcpu; struct llist_node *self; /* @@ -104,18 +103,22 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) /* * This function can be renentered by irqs and nmis for the same cgroup * and may try to insert the same per-cpu lnode into the llist. Note - * that llist_add() does not protect against such scenarios. + * that llist_add() does not protect against such scenarios. In addition + * this same per-cpu lnode can be modified through init_llist_node() + * from css_rstat_flush() running on a different CPU. * * To protect against such stacked contexts of irqs/nmis, we use the * fact that lnode points to itself when not on a list and then use - * this_cpu_cmpxchg() to atomically set to NULL to select the winner + * try_cmpxchg() to atomically set to NULL to select the winner * which will call llist_add(). The losers can assume the insertion is * successful and the winner will eventually add the per-cpu lnode to * the llist. + * + * Please note that we can not use this_cpu_cmpxchg() here as on some + * archs it is not safe against modifications from multiple CPUs. */ self = &rstatc->lnode; - rstatc_pcpu = css->rstat_cpu; - if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) + if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL)) return; lhead = ss_lhead_cpu(css->ss, cpu); diff --git a/kernel/cpu.c b/kernel/cpu.c index b674fdf96208..8df2d773fe3b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -249,6 +249,14 @@ err: return ret; } +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + #ifdef CONFIG_SMP static bool cpuhp_is_ap_state(enum cpuhp_state state) { @@ -271,14 +279,6 @@ static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) complete(done); } -/* - * The former STARTING/DYING states, ran with IRQs disabled and must not fail. - */ -static bool cpuhp_is_atomic_state(enum cpuhp_state state) -{ - return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; -} - /* Synchronization state management */ enum cpuhp_sync_state { SYNC_STATE_DEAD, @@ -2364,7 +2364,14 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, else ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + if (cpuhp_is_atomic_state(state)) { + guard(irqsave)(); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + /* STARTING/DYING must not fail! */ + WARN_ON_ONCE(ret); + } else { + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + } #endif BUG_ON(ret && !bringup); return ret; diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index ee45dee33d49..26392badc36b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -93,7 +93,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, page = dma_alloc_from_contiguous(NULL, 1 << order, order, false); if (!page) - page = alloc_pages(gfp, order); + page = alloc_pages(gfp | __GFP_NOWARN, order); } while (!page && order-- > 0); if (!page) goto out; diff --git a/kernel/events/core.c b/kernel/events/core.c index ece716879cbc..dad0d3d2e85f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2317,8 +2317,6 @@ out: perf_event__header_size(leader); } -static void sync_child_event(struct perf_event *child_event); - static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; @@ -2337,7 +2335,6 @@ static void perf_child_detach(struct perf_event *event) lockdep_assert_held(&parent_event->child_mutex); */ - sync_child_event(event); list_del_init(&event->child_list); } @@ -4588,6 +4585,7 @@ out: static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx, + struct task_struct *task, bool revoke); /* @@ -4615,7 +4613,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - perf_event_exit_event(event, ctx, false); + perf_event_exit_event(event, ctx, ctx->task, false); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -12518,7 +12516,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, /* * De-schedule the event and mark it REVOKED. */ - perf_event_exit_event(event, ctx, true); + perf_event_exit_event(event, ctx, ctx->task, true); /* * All _free_event() bits that rely on event->pmu: @@ -14075,14 +14073,13 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); -static void sync_child_event(struct perf_event *child_event) +static void sync_child_event(struct perf_event *child_event, + struct task_struct *task) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) { - struct task_struct *task = child_event->ctx->task; - if (task && task != TASK_TOMBSTONE) perf_event_read_event(child_event, task); } @@ -14101,7 +14098,9 @@ static void sync_child_event(struct perf_event *child_event) static void perf_event_exit_event(struct perf_event *event, - struct perf_event_context *ctx, bool revoke) + struct perf_event_context *ctx, + struct task_struct *task, + bool revoke) { struct perf_event *parent_event = event->parent; unsigned long detach_flags = DETACH_EXIT; @@ -14124,6 +14123,9 @@ perf_event_exit_event(struct perf_event *event, mutex_lock(&parent_event->child_mutex); /* PERF_ATTACH_ITRACE might be set concurrently */ attach_state = READ_ONCE(event->attach_state); + + if (attach_state & PERF_ATTACH_CHILD) + sync_child_event(event, task); } if (revoke) @@ -14215,7 +14217,7 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit) perf_event_task(task, ctx, 0); list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) - perf_event_exit_event(child_event, ctx, false); + perf_event_exit_event(child_event, ctx, exit ? task : NULL, false); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f11ceb8be8c4..d546d32390a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -79,7 +79,7 @@ struct uprobe { * The generic code assumes that it has two members of unknown type * owned by the arch-specific code: * - * insn - copy_insn() saves the original instruction here for + * insn - copy_insn() saves the original instruction here for * arch_uprobe_analyze_insn(). * * ixol - potentially modified instruction to execute out of @@ -107,8 +107,8 @@ static LIST_HEAD(delayed_uprobe_list); * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - unsigned long *bitmap; /* 0 = free slot */ + wait_queue_head_t wq; /* if all slots are busy */ + unsigned long *bitmap; /* 0 = free slot */ struct page *page; /* @@ -116,7 +116,7 @@ struct xol_area { * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; static void uprobe_warn(struct task_struct *t, const char *msg) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6acf268f005b..f8e4e13dbe33 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -720,7 +720,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe); * This function must be called from an IRQ context with irq regs * initialized. */ -int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq); * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. */ -int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned long flags; int ret; @@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); * This function must be called from an NMI context with irq regs * initialized. **/ -int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0bb29316b436..8b1b4c8a4f54 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2470,6 +2470,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; + if (!act->affinity) + act->affinity = cpu_online_mask; + retval = __setup_irq(irq, desc, act); if (retval) diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index 9b2515f31afb..d2aeaf13c3ac 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -54,6 +54,7 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT config LIVEUPDATE bool "Live Update Orchestrator" depends on KEXEC_HANDOVER + depends on SHMEM help Enable the Live Update Orchestrator. Live Update is a mechanism, typically based on kexec, that allows the kernel to be updated diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index f7ecaf7740d1..944663d99dd9 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -399,10 +399,8 @@ static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) int err; nr = _IOC_NR(cmd); - if (nr < LIVEUPDATE_CMD_BASE || - (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) { + if (nr - LIVEUPDATE_CMD_BASE >= ARRAY_SIZE(luo_ioctl_ops)) return -EINVAL; - } ucmd.ubuffer = (void __user *)arg; err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index ddff87917b21..a32a777f6df8 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -554,17 +554,20 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, { struct liveupdate_file_op_args args = {0}; struct luo_file *luo_file; + bool found = false; int err; if (list_empty(&file_set->files_list)) return -ENOENT; list_for_each_entry(luo_file, &file_set->files_list, list) { - if (luo_file->token == token) + if (luo_file->token == token) { + found = true; break; + } } - if (luo_file->token != token) + if (!found) return -ENOENT; guard(mutex)(&luo_file->mutex); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 05f5a49e9649..94164f2dec6d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -41,6 +41,13 @@ static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); +/* + * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass + * depth on enable failure. Will be removed when bypass depth is moved into the + * sched instance. + */ +static bool scx_bypassed_for_enable; + static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); @@ -975,6 +982,30 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } +static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + /* + * If @rq is in balance, the CPU is already vacant and looking for the + * next task to run. No need to preempt or trigger resched after moving + * @p into its local DSQ. + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) + resched_curr(rq); +} + static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { @@ -1086,22 +1117,10 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - if (is_local) { - struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; - - if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && - rq->curr->sched_class == &ext_sched_class) { - rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, - rq->curr->sched_class)) - resched_curr(rq); - } else { + if (is_local) + local_dsq_post_enq(dsq, p, enq_flags); + else raw_spin_unlock(&dsq->lock); - } } static void task_unlink_from_dsq(struct task_struct *p, @@ -1625,6 +1644,8 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_mod_nr(dst_dsq, 1); p->scx.dsq = dst_dsq; + + local_dsq_post_enq(dst_dsq, p, enq_flags); } /** @@ -2402,7 +2423,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * ops.enqueue() that @p is the only one available for this cpu, * which should trigger an explicit follow-up scheduling event. */ - if (sched_class_above(&ext_sched_class, next->sched_class)) { + if (next && sched_class_above(&ext_sched_class, next->sched_class)) { WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); } else { @@ -2425,7 +2446,7 @@ static struct task_struct * do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; - bool keep_prev, kick_idle = false; + bool keep_prev; struct task_struct *p; /* see kick_cpus_irq_workfn() */ @@ -2467,12 +2488,8 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); - if (!p) { - if (kick_idle) - scx_kick_cpu(rcu_dereference_sched(scx_root), - cpu_of(rq), SCX_KICK_IDLE); + if (!p) return NULL; - } if (unlikely(!p->scx.slice)) { struct scx_sched *sch = rcu_dereference_sched(scx_root); @@ -3575,7 +3592,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; irq_work_sync(&sch->error_irq_work); - kthread_stop(sch->helper->task); + kthread_destroy_worker(sch->helper); free_percpu(sch->pcpu); @@ -4318,6 +4335,11 @@ static void scx_disable_workfn(struct kthread_work *work) scx_dsp_max_batch = 0; free_kick_syncs(); + if (scx_bypassed_for_enable) { + scx_bypassed_for_enable = false; + scx_bypass(false); + } + mutex_unlock(&scx_enable_mutex); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); @@ -4786,7 +4808,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) return sch; err_stop_helper: - kthread_stop(sch->helper->task); + kthread_destroy_worker(sch->helper); err_free_pcpu: free_percpu(sch->pcpu); err_free_gdsqs: @@ -4970,6 +4992,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) * Init in bypass mode to guarantee forward progress. */ scx_bypass(true); + scx_bypassed_for_enable = true; for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) @@ -5067,6 +5090,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); + scx_bypassed_for_enable = false; scx_bypass(false); if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { |
