Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/dmem.c                70
-rw-r--r--  kernel/dma/contiguous.c             16
-rw-r--r--  kernel/dma/pool.c                    7
-rw-r--r--  kernel/events/callchain.c            2
-rw-r--r--  kernel/events/core.c                 6
-rw-r--r--  kernel/liveupdate/kexec_handover.c  12
-rw-r--r--  kernel/liveupdate/luo_file.c         2
-rw-r--r--  kernel/sched/deadline.c             12
-rw-r--r--  kernel/sched/ext.c                  48
-rw-r--r--  kernel/vmcore_info.c                 6
10 files changed, 159 insertions, 22 deletions
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index e12b946278b6..1ea6afffa985 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -14,6 +14,7 @@
 #include <linux/mutex.h>
 #include <linux/page_counter.h>
 #include <linux/parser.h>
+#include <linux/refcount.h>
 #include <linux/rculist.h>
 #include <linux/slab.h>
 
@@ -71,7 +72,9 @@ struct dmem_cgroup_pool_state {
 	struct rcu_head rcu;
 	struct page_counter cnt;
 
+	struct dmem_cgroup_pool_state *parent;
+	refcount_t ref;
 	bool inited;
 };
 
@@ -88,6 +91,9 @@ struct dmem_cgroup_pool_state {
 static DEFINE_SPINLOCK(dmemcg_lock);
 static LIST_HEAD(dmem_cgroup_regions);
 
+static void dmemcg_free_region(struct kref *ref);
+static void dmemcg_pool_free_rcu(struct rcu_head *rcu);
+
 static inline struct dmemcg_state *
 css_to_dmemcs(struct cgroup_subsys_state *css)
 {
@@ -104,10 +110,38 @@ static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
 	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
 }
 
+static void dmemcg_pool_get(struct dmem_cgroup_pool_state *pool)
+{
+	refcount_inc(&pool->ref);
+}
+
+static bool dmemcg_pool_tryget(struct dmem_cgroup_pool_state *pool)
+{
+	return refcount_inc_not_zero(&pool->ref);
+}
+
+static void dmemcg_pool_put(struct dmem_cgroup_pool_state *pool)
+{
+	if (!refcount_dec_and_test(&pool->ref))
+		return;
+
+	call_rcu(&pool->rcu, dmemcg_pool_free_rcu);
+}
+
+static void dmemcg_pool_free_rcu(struct rcu_head *rcu)
+{
+	struct dmem_cgroup_pool_state *pool = container_of(rcu, typeof(*pool), rcu);
+
+	if (pool->parent)
+		dmemcg_pool_put(pool->parent);
+	kref_put(&pool->region->ref, dmemcg_free_region);
+	kfree(pool);
+}
+
 static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
 {
 	list_del(&pool->region_node);
-	kfree(pool);
+	dmemcg_pool_put(pool);
 }
 
 static void
@@ -342,6 +376,12 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region
 	page_counter_init(&pool->cnt, ppool ? &ppool->cnt : NULL, true);
 	reset_all_resource_limits(pool);
 
+	refcount_set(&pool->ref, 1);
+	kref_get(&region->ref);
+	if (ppool && !pool->parent) {
+		pool->parent = ppool;
+		dmemcg_pool_get(ppool);
+	}
 	list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
 	list_add_tail(&pool->region_node, &region->pools);
 
@@ -389,6 +429,10 @@ get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *regio
 
 		/* Fix up parent links, mark as inited. */
 		pool->cnt.parent = &ppool->cnt;
+		if (ppool && !pool->parent) {
+			pool->parent = ppool;
+			dmemcg_pool_get(ppool);
+		}
 		pool->inited = true;
 
 		pool = ppool;
@@ -423,7 +467,7 @@ static void dmemcg_free_region(struct kref *ref)
  */
 void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
 {
-	struct list_head *entry;
+	struct dmem_cgroup_pool_state *pool, *next;
 
 	if (!region)
 		return;
@@ -433,11 +477,10 @@ void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
 	/* Remove from global region list */
 	list_del_rcu(&region->region_node);
 
-	list_for_each_rcu(entry, &region->pools) {
-		struct dmem_cgroup_pool_state *pool =
-			container_of(entry, typeof(*pool), region_node);
-
+	list_for_each_entry_safe(pool, next, &region->pools, region_node) {
 		list_del_rcu(&pool->css_node);
+		list_del(&pool->region_node);
+		dmemcg_pool_put(pool);
 	}
 
 	/*
@@ -518,8 +561,10 @@ static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
  */
 void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
 {
-	if (pool)
+	if (pool) {
 		css_put(&pool->cs->css);
+		dmemcg_pool_put(pool);
+	}
 }
 EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
 
@@ -533,6 +578,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
 	pool = find_cg_pool_locked(cg, region);
 	if (pool && !READ_ONCE(pool->inited))
 		pool = NULL;
+	if (pool && !dmemcg_pool_tryget(pool))
+		pool = NULL;
 	rcu_read_unlock();
 
 	while (!pool) {
@@ -541,6 +588,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
 			pool = get_cg_pool_locked(cg, region, &allocpool);
 		else
 			pool = ERR_PTR(-ENODEV);
+		if (!IS_ERR(pool))
+			dmemcg_pool_get(pool);
 		spin_unlock(&dmemcg_lock);
 
 		if (pool == ERR_PTR(-ENOMEM)) {
@@ -576,6 +625,7 @@ void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
 
 	page_counter_uncharge(&pool->cnt, size);
 	css_put(&pool->cs->css);
+	dmemcg_pool_put(pool);
 }
 EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
 
@@ -627,7 +677,9 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
 		if (ret_limit_pool) {
 			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
 			css_get(&(*ret_limit_pool)->cs->css);
+			dmemcg_pool_get(*ret_limit_pool);
 		}
+		dmemcg_pool_put(pool);
 		ret = -EAGAIN;
 		goto err;
 	}
@@ -700,6 +752,9 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
 		if (!region_name[0])
 			continue;
 
+		if (!options || !*options)
+			return -EINVAL;
+
 		rcu_read_lock();
 		region = dmemcg_get_region_by_name(region_name);
 		rcu_read_unlock();
@@ -719,6 +774,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
 
 		/* And commit */
 		apply(pool, new_limit);
+		dmemcg_pool_put(pool);
 
 out_put:
 		kref_put(&region->ref, dmemcg_free_region);
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index d8fd6f779f79..0e266979728b 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -91,6 +91,16 @@ static int __init early_cma(char *p)
 }
 early_param("cma", early_cma);
 
+/*
+ * cma_skip_dt_default_reserved_mem - This is called from the
+ * reserved_mem framework to detect if the default cma region is being
+ * set by the "cma=" kernel parameter.
+ */
+bool __init cma_skip_dt_default_reserved_mem(void)
+{
+	return size_cmdline != -1;
+}
+
 #ifdef CONFIG_DMA_NUMA_CMA
 
 static struct cma *dma_contiguous_numa_area[MAX_NUMNODES];
@@ -470,12 +480,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
 	struct cma *cma;
 	int err;
 
-	if (size_cmdline != -1 && default_cma) {
-		pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n",
-			rmem->name);
-		return -EBUSY;
-	}
-
 	if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
 	    of_get_flat_dt_prop(node, "no-map", NULL))
 		return -EINVAL;
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index c5da29ad010c..2b2fbb709242 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -277,15 +277,20 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size,
 {
 	struct gen_pool *pool = NULL;
 	struct page *page;
+	bool pool_found = false;
 
 	while ((pool = dma_guess_pool(pool, gfp))) {
+		pool_found = true;
 		page = __dma_alloc_from_pool(dev, size, pool, cpu_addr, phys_addr_ok);
 		if (page)
 			return page;
 	}
 
-	WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev));
+	if (pool_found)
+		WARN(!(gfp & __GFP_NOWARN), "DMA pool exhausted for %s\n", dev_name(dev));
+	else
+		WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev));
 	return NULL;
 }
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1f6589578703..9d24b6e0c91f 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -246,7 +246,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 
 	if (user && !crosstask) {
 		if (!user_mode(regs)) {
-			if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+			if (!is_user_task(current))
 				goto exit_put;
 			regs = task_pt_regs(current);
 		}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a0fa488bce84..8cca80094624 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7460,7 +7460,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
 	if (user_mode(regs)) {
 		regs_user->abi = perf_reg_abi(current);
 		regs_user->regs = regs;
-	} else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+	} else if (is_user_task(current)) {
 		perf_get_regs_user(regs_user, regs);
 	} else {
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8100,7 +8100,7 @@ static u64 perf_virt_to_phys(u64 virt)
 		 * Try IRQ-safe get_user_page_fast_only first.
 		 * If failed, leave phys_addr as 0.
 		 */
-		if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+		if (is_user_task(current)) {
 			struct page *p;
 
 			pagefault_disable();
@@ -8215,7 +8215,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
 	bool kernel = !event->attr.exclude_callchain_kernel;
 	bool user = !event->attr.exclude_callchain_user &&
-		!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
+		is_user_task(current);
 	/* Disallow cross-task user callchains. */
 	bool crosstask = event->ctx->task && event->ctx->task != current;
 	bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index d4482b6e3cae..90d411a59f76 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -255,6 +255,14 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
 	if (is_folio && info.order)
 		prep_compound_page(page, info.order);
 
+	/* Always mark headpage's codetag as empty to avoid accounting mismatch */
+	clear_page_tag_ref(page);
+	if (!is_folio) {
+		/* Also do that for the non-compound tail pages */
+		for (unsigned int i = 1; i < nr_pages; i++)
+			clear_page_tag_ref(page + i);
+	}
+
 	adjust_managed_page_count(page, nr_pages);
 	return page;
 }
@@ -1006,8 +1014,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
 		chunk->phys[idx++] = phys;
 		if (idx == ARRAY_SIZE(chunk->phys)) {
 			chunk = new_vmalloc_chunk(chunk);
-			if (!chunk)
+			if (!chunk) {
+				err = -ENOMEM;
 				goto err_free;
+			}
 			idx = 0;
 		}
 	}
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index a32a777f6df8..9f7283379ebc 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -402,8 +402,6 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
 		luo_file->fh->ops->unfreeze(&args);
 	}
-
-	luo_file->serialized_data = 0;
 }
 
 static void __luo_file_unfreeze(struct luo_file_set *file_set,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c509f2e7d69d..7bcde7114f1b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1034,6 +1034,12 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
 			return;
 		}
 
+		/*
+		 * When [4] D->A is followed by [1] A->B, dl_defer_running
+		 * needs to be cleared, otherwise it will fail to properly
+		 * start the zero-laxity timer.
+		 */
+		dl_se->dl_defer_running = 0;
 		replenish_dl_new_period(dl_se, rq);
 	} else if (dl_server(dl_se) && dl_se->dl_defer) {
 		/*
@@ -1655,6 +1661,12 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
  *          dl_server_active = 1;
  *          enqueue_dl_entity()
  *            update_dl_entity(WAKEUP)
+ *              if (dl_time_before() || dl_entity_overflow())
+ *                dl_defer_running = 0;
+ *                replenish_dl_new_period();
+ *                  // fwd period
+ *                  dl_throttled = 1;
+ *                  dl_defer_armed = 1;
  *            if (!dl_defer_running)
  *              dl_defer_armed = 1;
  *              dl_throttled = 1;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index afe28c04d5aa..0bb8fa927e9e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -194,6 +194,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
 #include <trace/events/sched_ext.h>
 
 static void process_ddsp_deferred_locals(struct rq *rq);
+static bool task_dead_and_done(struct task_struct *p);
 static u32 reenq_local(struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -2619,6 +2620,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 
 	set_cpus_allowed_common(p, ac);
 
+	if (task_dead_and_done(p))
+		return;
+
 	/*
 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3034,10 +3038,45 @@ void scx_cancel_fork(struct task_struct *p)
 	percpu_up_read(&scx_fork_rwsem);
 }
 
+/**
+ * task_dead_and_done - Is a task dead and done running?
+ * @p: target task
+ *
+ * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
+ * task no longer exists from SCX's POV. However, certain sched_class ops may be
+ * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
+ * may try to switch a task which finished sched_ext_dead() back into SCX
+ * triggering invalid SCX task state transitions and worse.
+ *
+ * Once a task has finished the final switch, sched_ext_dead() is the only thing
+ * that needs to happen on the task. Use this test to short-circuit sched_class
+ * operations which may be called on dead tasks.
+ */
+static bool task_dead_and_done(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
+	 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p
+	 * won't ever run again.
+	 */
+	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
+		!task_on_cpu(rq, p);
+}
+
 void sched_ext_dead(struct task_struct *p)
 {
 	unsigned long flags;
 
+	/*
+	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
+	 * for the last time and then dropped the rq lock - task_dead_and_done()
+	 * should be returning %true nullifying the straggling sched_class ops.
+	 * Remove from scx_tasks and exit @p.
+	 */
 	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
 	list_del_init(&p->scx.tasks_node);
 	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
@@ -3063,6 +3102,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 
 	lockdep_assert_rq_held(task_rq(p));
 
+	if (task_dead_and_done(p))
+		return;
+
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(sch, set_weight))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3077,6 +3119,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 {
 	struct scx_sched *sch = scx_root;
 
+	if (task_dead_and_done(p))
+		return;
+
 	scx_enable_task(p);
 
 	/*
@@ -3090,6 +3135,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
 {
+	if (task_dead_and_done(p))
+		return;
+
 	scx_disable_task(p);
 }
 
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index fe9bf8db1922..e2784038bbed 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -36,7 +36,11 @@ struct hwerr_info {
 	time64_t timestamp;
 };
 
-static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+/*
+ * The hwerr_data[] array is declared with global scope so that it remains
+ * accessible to vmcoreinfo even when Link Time Optimization (LTO) is enabled.
+ */
+struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
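
As background for the kernel/cgroup/dmem.c hunks above: pool lookups happen under RCU, so a pool can be found while a concurrent unregister is already tearing it down. The patch handles this by giving each pool a refcount that readers take with refcount_inc_not_zero() and by deferring the final kfree() through call_rcu(). A minimal sketch of that lifetime pattern, using hypothetical names (struct pool, pool_lookup, pool_put) rather than the dmem_cgroup_* ones, could look like this:

/* Simplified sketch of the lookup/teardown pattern; not the dmem.c code itself. */
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct pool {
	refcount_t ref;
	struct rcu_head rcu;
};

static void pool_free_rcu(struct rcu_head *rcu)
{
	/* Runs after a grace period, when no RCU reader can still see the pool. */
	kfree(container_of(rcu, struct pool, rcu));
}

static void pool_put(struct pool *p)
{
	if (refcount_dec_and_test(&p->ref))
		call_rcu(&p->rcu, pool_free_rcu);
}

static struct pool *pool_lookup(struct pool __rcu **slot)
{
	struct pool *p;

	rcu_read_lock();
	p = rcu_dereference(*slot);
	/* Only keep the pool if its refcount has not already dropped to zero. */
	if (p && !refcount_inc_not_zero(&p->ref))
		p = NULL;
	rcu_read_unlock();
	return p;
}

A reader that wins the tryget keeps the object alive past rcu_read_unlock(); a reader that loses simply sees NULL, which is how get_cg_pool_unlocked() now uses dmemcg_pool_tryget().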
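
The kernel/events changes replace open-coded tests of current->flags & (PF_KTHREAD | PF_USER_WORKER) with an is_user_task() helper. Its definition is not part of this diffstat (which is limited to 'kernel'), but judging from the call sites it replaces, it is presumably a small inline along these lines:

/* Presumed shape of the helper, inferred from the call sites it replaces. */
#include <linux/sched.h>

static inline bool is_user_task(struct task_struct *p)
{
	/* A task counts as a user task when it is neither a kthread nor a user worker. */
	return !(p->flags & (PF_KTHREAD | PF_USER_WORKER));
}

That matches the condition perf previously open-coded before sampling user registers, resolving user callchains, or walking user page tables.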
