From 0e18dd12064e07519f7cbff4149ca7fff620cbed Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 15 Nov 2017 08:47:02 +0300 Subject: perf/core: Fix memory leak triggered by perf --namespace perf with --namespace key leaks various memory objects including namespaces 4.14.0+ pid_namespace 1 12 2568 12 8 user_namespace 1 39 824 39 8 net_namespace 1 5 6272 5 8 This happen because perf_fill_ns_link_info() struct patch ns_path: during initialization ns_path incremented counters on related mnt and dentry, but without lost path_put nobody decremented them back. Leaked dentry is name of related namespace, and its leak does not allow to free unused namespace. Signed-off-by: Vasily Averin Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Hari Bathini Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Thomas Gleixner Fixes: commit e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include namespaces related info") Link: http://lkml.kernel.org/r/c510711b-3904-e5e1-d296-61273d21118d@virtuozzo.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d..ab5ac84f82e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6756,6 +6756,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; + path_put(&ns_path); } } -- cgit v1.2.3 From 1690102de5651bb85b23d5eeaff682a6b96d705b Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Sun, 19 Nov 2017 16:48:13 -0200 Subject: blktrace: Use blk_trace_bio_get_cgid inside blk_add_trace_bio We always pass in blk_trace_bio_get_cgid(q, bio) to blk_add_trace_bio(). Since both are readily available in the function already, kill the argument. Signed-off-by: Marcos Paulo de Souza Rewrote commit message. Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 206e0e2ace53..c5987d4c5f23 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -872,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error, union kernfs_node_id *cgid) + u32 what, int error) { struct blk_trace *bt = q->blk_trace; @@ -880,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -903,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -912,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, @@ -928,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -945,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { struct blk_trace *bt = q->blk_trace; -- cgit v1.2.3 From aa24163b2ee5c92120e32e99b5a93143a0f4258e Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Wed, 15 Nov 2017 19:50:14 +0530 Subject: cgroup/cpuset: remove circular dependency deadlock Remove circular dependency deadlock in a scenario where hotplug of CPU is being done while there is updation in cgroup and cpuset triggered from userspace. Process A => kthreadd => Process B => Process C => Process A Process A cpu_subsys_offline(); cpu_down(); _cpu_down(); percpu_down_write(&cpu_hotplug_lock); //held cpuhp_invoke_callback(); workqueue_offline_cpu(); queue_work_on(); // unbind_work on system_highpri_wq __queue_work(); insert_work(); wake_up_worker(); flush_work(); wait_for_completion(); worker_thread(); manage_workers(); create_worker(); kthread_create_on_node(); wake_up_process(kthreadd_task); kthreadd kthreadd(); kernel_thread(); do_fork(); copy_process(); percpu_down_read(&cgroup_threadgroup_rwsem); __rwsem_down_read_failed_common(); //waiting Process B kernfs_fop_write(); cgroup_file_write(); cgroup_procs_write(); percpu_down_write(&cgroup_threadgroup_rwsem); //held cgroup_attach_task(); cgroup_migrate(); cgroup_migrate_execute(); cpuset_can_attach(); mutex_lock(&cpuset_mutex); //waiting Process C kernfs_fop_write(); cgroup_file_write(); cpuset_write_resmask(); mutex_lock(&cpuset_mutex); //held update_cpumask(); update_cpumasks_hier(); rebuild_sched_domains_locked(); get_online_cpus(); percpu_down_read(&cpu_hotplug_lock); //waiting Eliminating deadlock by reversing the locking order for cpuset_mutex and cpu_hotplug_lock. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f7efa7b4d825..cab5fd1ee767 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -812,6 +812,18 @@ done: return ndoms; } +static void cpuset_sched_change_begin(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +static void cpuset_sched_change_end(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + /* * Rebuild scheduler domains. * @@ -821,16 +833,14 @@ done: * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -838,27 +848,25 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_begin(); + rebuild_sched_domains_cpuslocked(); + cpuset_sched_change_end(); } /** @@ -944,7 +952,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } /** @@ -1276,7 +1284,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } return 0; @@ -1309,7 +1317,6 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ - static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1342,7 +1349,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1610,7 +1617,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1646,7 +1653,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1657,7 +1664,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1670,7 +1677,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1709,7 +1716,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1733,7 +1740,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2034,14 +2041,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_cpuslocked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2049,7 +2056,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 1599a185f0e6113be185b9fb809c621c73865829 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Wed, 15 Nov 2017 19:50:15 +0530 Subject: cpuset: Make cpuset hotplug synchronous Convert cpuset_hotplug_workfn() into synchronous call for cpu hotplug path. For memory hotplug path it still gets queued as a work item. Since cpuset_hotplug_workfn() can be made synchronous for cpu hotplug path, it is not required to wait for cpuset hotplug while thawing processes. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 41 ++++++++++++++++++++--------------------- kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 3 files changed, 20 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cab5fd1ee767..227bc25d951d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2277,15 +2277,8 @@ retry: mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2300,7 +2293,7 @@ void cpuset_force_rebuild(void) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug(bool use_cpu_hp_lock) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2358,25 +2351,31 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); + if (cpus_updated) { + if (use_cpu_hp_lock) + rebuild_sched_domains(); + else { + /* Acquiring cpu_hotplug_lock is not required. + * When cpuset_hotplug() is called in hotplug path, + * cpu_hotplug_lock is held by the hotplug context + * which is waiting for cpuhp_thread_fun to indicate + * completion of callback. + */ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_cpuslocked(); + mutex_unlock(&cpuset_mutex); + } } } -void cpuset_update_active_cpus(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); + cpuset_hotplug(true); } -void cpuset_wait_for_hotplug(void) +void cpuset_update_active_cpus(void) { - flush_work(&cpuset_hotplug_work); + cpuset_hotplug(false); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 7381d49a44db..c326d7235c5f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,8 +204,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..88b3450b29ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5624,7 +5624,6 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- cgit v1.2.3 From c98a9805096460567404799a7bd3149826affde7 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Fri, 3 Nov 2017 17:27:50 +0200 Subject: workqueue: respect isolated cpus when queueing an unbound work Initialize wq_unbound_cpumask to exclude cpus that were isolated by the cmdline's isolcpus parameter. Signed-off-by: Tal Shorer Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..6a5658cb46da 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -4957,6 +4958,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5555,7 +5560,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From 2967acbb257a6a9bf912f4778b727e00972eac9b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 19 Nov 2017 11:52:55 -0700 Subject: blktrace: fix trace mutex deadlock A previous commit changed the locking around registration/cleanup, but direct callers of blk_trace_remove() were missed. This means that if we hit the error path in setup, we will deadlock on attempting to re-acquire the queue trace mutex. Fixes: 1f2cac107c59 ("blktrace: fix unlocked access to init/start-stop/teardown") Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c5987d4c5f23..987d9a9ae283 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -591,7 +591,7 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { - blk_trace_remove(q); + __blk_trace_remove(q); return -EFAULT; } return 0; @@ -637,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { - blk_trace_remove(q); + __blk_trace_remove(q); return -EFAULT; } -- cgit v1.2.3 From ddf7005f32212f28669032651e09bd8d2245c35d Mon Sep 17 00:00:00 2001 From: Wang Long Date: Sun, 19 Nov 2017 16:08:37 -0500 Subject: debug cgroup: use task_css_set instead of rcu_dereference This macro `task_css_set` verifies that the caller is inside proper critical section if the kernel set CONFIG_PROVE_RCU=y. Signed-off-by: Wang Long Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f780d8f6a9d..9caeda610249 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); if (refcnt > cset->nr_tasks) @@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; -- cgit v1.2.3 From a39e17b2d842938e19997d2fdc0443fdd4cd8d10 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Nov 2017 12:10:23 -0800 Subject: bpf: offload: add a license header I forgot to add a license on kernel/bpf/offload.c. Luckily I'm still the only author so make it explicitly GPLv2. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 68ec884440b7..8455b89d1bbf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree. + * + * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE + * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME + * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + */ + #include #include #include -- cgit v1.2.3 From 46febd37f9c758b05cd25feae8512f22584742fe Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Nov 2017 21:19:53 +0800 Subject: smp/hotplug: Move step CPUHP_AP_SMPCFD_DYING to the correct place Commit 31487f8328f2 ("smp/cfd: Convert core to hotplug state machine") accidently put this step on the wrong place. The step should be at the cpuhp_ap_states[] rather than the cpuhp_bp_states[]. grep smpcfd /sys/devices/system/cpu/hotplug/states 40: smpcfd:prepare 129: smpcfd:dying "smpcfd:dying" was missing before. So was the invocation of the function smpcfd_dying_cpu(). Fixes: 31487f8328f2 ("smp/cfd: Convert core to hotplug state machine") Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Cc: Boris Ostrovsky Link: https://lkml.kernel.org/r/20171128131954.81229-1-jiangshanlai@gmail.com --- kernel/cpu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 04892a82f6ac..7891aecc6aec 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_AP_SMPCFD_DYING] = { - .name = "smpcfd:dying", - .startup.single = NULL, - .teardown.single = smpcfd_dying_cpu, - }, /* * Handled on controll processor until the plugged processor manages * this itself. @@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, + [CPUHP_AP_SMPCFD_DYING] = { + .name = "smpcfd:dying", + .startup.single = NULL, + .teardown.single = smpcfd_dying_cpu, + }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { -- cgit v1.2.3 From 52cf373c37a684f8fc279d541307fad39d206376 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Tue, 28 Nov 2017 13:59:25 +0100 Subject: cgroup: properly init u64_stats Lockdep complains that the stats update is trying to register a non-static key. This is because u64_stats are using a seqlock on 32bit arches, which needs to be initialized before usage. Fixes: 041cd640b2f3 (cgroup: Implement cgroup2 basic CPU usage accounting) Signed-off-by: Lucas Stach Signed-off-by: Tejun Heo --- kernel/cgroup/stat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 133b465691d6..1e111dd455c4 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp) } /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) - cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } prev_cputime_init(&cgrp->stat.prev_cputime); -- cgit v1.2.3 From 34900ec5c9577cc1b0f22887ac7349f458ba8ac2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 9 Aug 2017 18:14:06 +0200 Subject: perf: Fix header.size for namespace events Reset header size for namespace events, otherwise it only gets bigger in ctx iterations. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Fixes: e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include namespaces related info") Link: http://lkml.kernel.org/n/tip-nlo4gonz9d4guyb8153ukzt0@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c39c05e029a..799bb352d99f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6640,6 +6640,7 @@ static void perf_event_namespaces_output(struct perf_event *event, struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; + u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) @@ -6650,7 +6651,7 @@ static void perf_event_namespaces_output(struct perf_event *event, ret = perf_output_begin(&handle, event, namespaces_event->event_id.header.size); if (ret) - return; + goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); @@ -6662,6 +6663,8 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); +out: + namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, -- cgit v1.2.3 From 668533dc0764b30c9dd2baf3ca800156f688326b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 29 Nov 2017 10:30:13 -0800 Subject: kallsyms: take advantage of the new '%px' format The conditional kallsym hex printing used a special fixed-width '%lx' output (KALLSYM_FMT) in preparation for the hashing of %p, but that series ended up adding a %px specifier to help with the conversions. Use it, and avoid the "print pointer as an unsigned long" code. Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 8 ++++---- kernel/module.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 531ffa984bc2..d5fa4116688a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -614,14 +614,14 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - unsigned long value; + void *value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; - value = iter->show_value ? iter->value : 0; + value = iter->show_value ? (void *)iter->value : NULL; if (iter->module_name[0]) { char type; @@ -632,10 +632,10 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value, + seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else - seq_printf(m, KALLSYM_FMT " %c %s\n", value, + seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); return 0; } diff --git a/kernel/module.c b/kernel/module.c index f0411a271765..dea01ac9cb74 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4157,7 +4157,7 @@ static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); char buf[MODULE_FLAGS_BUF_SIZE]; - unsigned long value; + void *value; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4173,8 +4173,8 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - value = m->private ? 0 : (unsigned long)mod->core_layout.base; - seq_printf(m, " 0x" KALLSYM_FMT, value); + value = m->private ? NULL : mod->core_layout.base; + seq_printf(m, " 0x%px", value); /* Taints info */ if (mod->taints) -- cgit v1.2.3 From c8c088ba0edf65044c254b96fc438c91914aaab0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 30 Nov 2017 13:47:54 -0800 Subject: bpf: set maximum number of attached progs to 64 for a single perf tp cgropu+bpf prog array has a maximum number of 64 programs. Let us apply the same limit here. Fixes: e87c6bc3852b ("bpf: permit multiple bpf attachments for a single perf event") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 3 ++- kernel/trace/bpf_trace.c | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b9f8686a84cf..86b50aa26ee8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1447,7 +1447,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) - cnt++; + if (*prog != &dummy_bpf_prog.prog) + cnt++; rcu_read_unlock(); return cnt; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 27d1f4ffa3de..0ce99c379c30 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -759,6 +759,8 @@ const struct bpf_prog_ops perf_event_prog_ops = { static DEFINE_MUTEX(bpf_event_mutex); +#define BPF_TRACE_MAX_PROGS 64 + int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { @@ -772,6 +774,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event, goto unlock; old_array = event->tp_event->prog_array; + if (old_array && + bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { + ret = -E2BIG; + goto unlock; + } + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) goto unlock; -- cgit v1.2.3 From 5a93bae2c382c588f437ce0395e8032ae287dc36 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 19 Oct 2017 14:32:33 +0800 Subject: tracing: Fix code comments in trace.c Naming in code comments for tracing_snapshot, tracing_snapshot_alloc and trace_pid_filter_add_remove_task don't match the real function names. And latency_trace has been removed from tracing directory. Fix them. Link: http://lkml.kernel.org/r/1508394753-20887-1-git-send-email-chuhu@redhat.com Fixes: cab5037 ("tracing/ftrace: Enable snapshot function trigger") Fixes: 886b5b7 ("tracing: remove /debug/tracing/latency_trace") Signed-off-by: Chunyu Hu [ Replaced /sys/kernel/debug/tracing with /sys/kerne/tracing ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..5815ec16edd4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove @@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr) } /** - * trace_snapshot - take a snapshot of the current buffer. + * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live @@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void) EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * - * This is similar to trace_snapshot(), but it will allocate the + * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * @@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -- cgit v1.2.3 From 90e406f96f630c07d631a021fd4af10aac913e77 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Nov 2017 11:39:43 +0800 Subject: tracing: Allocate mask_str buffer dynamically The default NR_CPUS can be very large, but actual possible nr_cpu_ids usually is very small. For my x86 distribution, the NR_CPUS is 8192 and nr_cpu_ids is 4. About 2 pages are wasted. Most machines don't have so many CPUs, so define a array with NR_CPUS just wastes memory. So let's allocate the buffer dynamically when need. With this change, the mutext tracing_cpumask_update_lock also can be removed now, which was used to protect mask_str. Link: http://lkml.kernel.org/r/1512013183-19107-1-git-send-email-changbin.du@intel.com Fixes: 36dfe9252bd4c ("ftrace: make use of tracing_cpumask") Cc: stable@vger.kernel.org Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5815ec16edd4..9f3f043ba3b7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; int len; - mutex_lock(&tracing_cpumask_update_lock); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; - len = snprintf(mask_str, count, "%*pb\n", + len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: - mutex_unlock(&tracing_cpumask_update_lock); + kfree(mask_str); return count; } @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - mutex_lock(&tracing_cpumask_update_lock); - local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; -- cgit v1.2.3 From 2dde6b0034dbc050957cdb6539ce28eca57e8cdf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 3 Nov 2017 11:39:57 +0100 Subject: tracing: make PREEMPTIRQ_EVENTS depend on TRACING When CONFIG_TRACING is disabled, the new preemptirq events tracer produces a build failure: In file included from kernel/trace/trace_irqsoff.c:17:0: kernel/trace/trace.h: In function 'trace_test_and_set_recursion': kernel/trace/trace.h:542:28: error: 'struct task_struct' has no member named 'trace_recursion' Adding an explicit dependency avoids the broken configuration. Link: http://lkml.kernel.org/r/20171103104031.270375-1-arnd@arndb.de Fixes: d59158162e03 ("tracing: Add support for preempt and irq enable/disable events") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..904c952ac383 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS depends on DEBUG_PREEMPT || !PROVE_LOCKING + depends on TRACING default n help Enable tracing of disable and enable events for preemption and irqs. -- cgit v1.2.3 From c4bfd39d7fa5203d4b387c283d360e9a108e85b3 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 17 May 2017 17:14:15 -0700 Subject: ring-buffer: Remove unused function __rb_data_page_index() This fixes the following warning when building with clang: kernel/trace/ring_buffer.c:1842:1: error: unused function '__rb_data_page_index' [-Werror,-Wunused-function] Link: http://lkml.kernel.org/r/20170518001415.5223-1-mka@chromium.org Reviewed-by: Douglas Anderson Signed-off-by: Matthias Kaehlcke Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..c87766c1c204 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1799,12 +1799,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static __always_inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ - return bpage->data + index; -} - static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; -- cgit v1.2.3 From a773d419275bf54854ca6cfda8f2594ed2790faa Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 2 Jun 2017 13:20:25 +0300 Subject: tracing: Pass export pointer as argument to ->write() By passing an export descriptor to the write function, users don't need to keep a global static pointer and can rely on container_of() to fetch their own structure. Link: http://lkml.kernel.org/r/20170602102025.5140-1-felipe.balbi@linux.intel.com Acked-by: Steven Rostedt (VMware) Reviewed-by: Chunyan Zhang Signed-off-by: Felipe Balbi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3f043ba3b7..59518b8126d0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export, entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); - export->write(entry, size); + export->write(export, entry, size); } static DEFINE_MUTEX(ftrace_export_lock); -- cgit v1.2.3 From bb5c43428252f27b875c764451321a83a531d6e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Nov 2017 15:40:33 +0100 Subject: genirq/matrix: Fix the precedence fix for real The previous commit which made the operator precedence in irq_matrix_available() explicit made the implicit brokenness explicitely wrong. It was wrong in the original commit already. The overworked maintainer did not notice it either when merging the patch. Replace the confusing '?' construct by a simple and obvious if (). Fixes: 75f1133873d6 ("genirq/matrix: Make - vs ?: Precedence explicit") Reported-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Cc: Kees Cook --- kernel/irq/matrix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 7df2480005f8..0ba0dd8863a7 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -384,7 +384,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) { struct cpumap *cm = this_cpu_ptr(m->maps); - return (m->global_available - cpudown) ? cm->available : 0; + if (!cpudown) + return m->global_available; + return m->global_available - cm->available; } /** -- cgit v1.2.3 From 11db855c3d06e82f432cb1bafd73296586d5ceec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2017 14:41:11 -0800 Subject: Revert "cpuset: Make cpuset hotplug synchronous" This reverts commit 1599a185f0e6113be185b9fb809c621c73865829. This and the previous commit led to another circular locking scenario and the scenario which is fixed by this commit no longer exists after e8b3f8db7aad ("workqueue/hotplug: simplify workqueue_offline_cpu()") which removes work item flushing from hotplug path. Revert it for now. Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 41 +++++++++++++++++++++-------------------- kernel/power/process.c | 2 ++ kernel/sched/core.c | 1 + 3 files changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 227bc25d951d..cab5fd1ee767 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2277,8 +2277,15 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** - * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2293,7 +2300,7 @@ retry: * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug(bool use_cpu_hp_lock) +static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2351,31 +2358,25 @@ static void cpuset_hotplug(bool use_cpu_hp_lock) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) { - if (use_cpu_hp_lock) - rebuild_sched_domains(); - else { - /* Acquiring cpu_hotplug_lock is not required. - * When cpuset_hotplug() is called in hotplug path, - * cpu_hotplug_lock is held by the hotplug context - * which is waiting for cpuhp_thread_fun to indicate - * completion of callback. - */ - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_cpuslocked(); - mutex_unlock(&cpuset_mutex); - } + if (cpus_updated || force_rebuild) { + force_rebuild = false; + rebuild_sched_domains(); } } -static void cpuset_hotplug_workfn(struct work_struct *work) +void cpuset_update_active_cpus(void) { - cpuset_hotplug(true); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + */ + schedule_work(&cpuset_hotplug_work); } -void cpuset_update_active_cpus(void) +void cpuset_wait_for_hotplug(void) { - cpuset_hotplug(false); + flush_work(&cpuset_hotplug_work); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index c326d7235c5f..7381d49a44db 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,6 +204,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88b3450b29ab..75554f366fd3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5624,6 +5624,7 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- cgit v1.2.3 From e8b3f8db7aad99fcc5234fc5b89984ff6620de3d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Dec 2017 22:20:36 +0800 Subject: workqueue/hotplug: simplify workqueue_offline_cpu() Since the recent cpu/hotplug refactoring, workqueue_offline_cpu() is guaranteed to run on the local cpu which is going offline. This also fixes the following deadlock by removing work item scheduling and flushing from CPU hotplug path. http://lkml.kernel.org/r/1504764252-29091-1-git-send-email-prsood@codeaurora.org tj: Description update. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6a5658cb46da..48a4d00f55dc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1635,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because wq_unbind_fn() releases + * Sanity check nr_running. Because unbind_workers() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. @@ -4511,9 +4511,8 @@ void show_workqueue_state(void) * cpu comes back online. */ -static void wq_unbind_fn(struct work_struct *work) +static void unbind_workers(int cpu) { - int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; @@ -4710,12 +4709,13 @@ int workqueue_online_cpu(unsigned int cpu) int workqueue_offline_cpu(unsigned int cpu) { - struct work_struct unbind_work; struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + if (WARN_ON(cpu != smp_processor_id())) + return -1; + + unbind_workers(cpu); /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); @@ -4723,9 +4723,6 @@ int workqueue_offline_cpu(unsigned int cpu) wq_update_unbound_numa(wq, cpu, false); mutex_unlock(&wq_pool_mutex); - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); return 0; } -- cgit v1.2.3 From 62408c1ef00784e8bcfc4848ade76480fb8aed21 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Dec 2017 22:23:07 +0800 Subject: workqueue/hotplug: remove the workaround in rebind_workers() Since the cpu/hotplug refactoring, DOWN_FAILED is never called without preceding DOWN_PREPARE making the workaround unnecessary. Remove it. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 48a4d00f55dc..45ce93f3dd1f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4589,16 +4589,6 @@ static void rebind_workers(struct worker_pool *pool) spin_lock_irq(&pool->lock); - /* - * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED - * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is - * being reworked and this can go away in time. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - spin_unlock_irq(&pool->lock); - return; - } - pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { -- cgit v1.2.3 From bdfbbda90aeb75ce0951413fd7f495d4d377bd5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2017 14:55:59 -0800 Subject: Revert "cgroup/cpuset: remove circular dependency deadlock" This reverts commit aa24163b2ee5c92120e32e99b5a93143a0f4258e. This and the following commit led to another circular locking scenario and the scenario which is fixed by this commit no longer exists after e8b3f8db7aad ("workqueue/hotplug: simplify workqueue_offline_cpu()") which removes work item flushing from hotplug path. Revert it for now. Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cab5fd1ee767..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -812,18 +812,6 @@ done: return ndoms; } -static void cpuset_sched_change_begin(void) -{ - cpus_read_lock(); - mutex_lock(&cpuset_mutex); -} - -static void cpuset_sched_change_end(void) -{ - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); -} - /* * Rebuild scheduler domains. * @@ -833,14 +821,16 @@ static void cpuset_sched_change_end(void) * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * + * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_cpuslocked(void) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); + get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -848,25 +838,27 @@ static void rebuild_sched_domains_cpuslocked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + goto out; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); +out: + put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_cpuslocked(void) +static void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - cpuset_sched_change_begin(); - rebuild_sched_domains_cpuslocked(); - cpuset_sched_change_end(); + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); } /** @@ -952,7 +944,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); } /** @@ -1284,7 +1276,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); } return 0; @@ -1317,6 +1309,7 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ + static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1349,7 +1342,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1617,7 +1610,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1653,7 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); return retval; } @@ -1664,7 +1657,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1677,7 +1670,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); return retval; } @@ -1716,7 +1709,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1740,7 +1733,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2041,14 +2034,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_cpuslocked(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2056,7 +2049,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From c895f6f703ad7dd2f99e751d9884b0aa5d0eea25 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Mon, 4 Dec 2017 10:56:44 +0100 Subject: bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type Commit 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type") introduced the bpf_perf_event_data structure which exports the pt_regs structure. This is OK for multiple architectures but fail for s390 and arm64 which do not export pt_regs. Programs using them, for example, the bpf selftest fail to compile on these architectures. For s390, exporting the pt_regs is not an option because s390 wants to allow changes to it. For arm64, there is a user_pt_regs structure that covers parts of the pt_regs structure for use by user space. To solve the broken uapi for s390 and arm64, introduce an abstract type for pt_regs and add an asm/bpf_perf_event.h file that concretes the type. An asm-generic header file covers the architectures that export pt_regs today. The arch-specific enablement for s390 and arm64 follows in separate commits. Reported-by: Thomas Richter Fixes: 0515e5999a466dfe ("bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type") Signed-off-by: Hendrik Brueckner Reviewed-and-tested-by: Thomas Richter Acked-by: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnd Bergmann Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..ba957b9812b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7987,11 +7987,11 @@ static void bpf_overflow_handler(struct perf_event *event, { struct bpf_perf_event_data_kern ctx = { .data = data, - .regs = regs, .event = event, }; int ret = 0; + ctx.regs = perf_arch_bpf_user_pt_regs(regs); preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; -- cgit v1.2.3 From b7ad7ef742a99c148631d38a98da72e7db8f6fd0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Dec 2017 08:23:20 -0800 Subject: remove task and stack pointer printout from oops dump Geert Uytterhoeven reported a NFS oops, and pointed out that some of the numbers were hashed and useless. We could just turn them from '%p' into '%px', but those numbers are really just legacy, and useless even when not hashed. So just remove them entirely. Reported-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..b9006617710f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl) void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - - printk("%stask: %p task.stack: %p\n", - log_lvl, current, task_stack_page(current)); } #endif -- cgit v1.2.3 From 5b1ead6800cb2241aeadcba32736c5836e59c7e1 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 6 Dec 2017 10:59:11 +0000 Subject: cpu/hotplug: Fix state name in takedown_cpu() comment CPUHP_AP_SCHED_MIGRATE_DYING doesn't exist, it looks like this was supposed to refer to CPUHP_AP_SCHED_STARTING's teardown callback, i.e. sched_cpu_dying(). Signed-off-by: Brendan Jackman Cc: Boris Ostrovsky Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206105911.28093-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 04892a82f6ac..2a885c5f2429 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu) BUG_ON(cpu_online(cpu)); /* - * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all - * runnable tasks from the cpu, there's only the idle task left now + * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed + * all runnable tasks from the CPU, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * * Wait for the stop thread to go away. -- cgit v1.2.3 From 5e351ad106997e06b2dc3da9c6b939b95f67fb88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2017 17:32:47 +0100 Subject: locking/lockdep: Fix possible NULL deref We can't invalidate xhlocks when we've not yet allocated any. Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: f52be5708076 ("locking/lockdep: Untangle xhlock history save/restore from task independence") Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9776da8db180..670d8d7d8087 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4790,7 +4790,8 @@ void lockdep_invariant_state(bool force) * Verify the former, enforce the latter. */ WARN_ON_ONCE(!force && current->lockdep_depth); - invalidate_xhlock(&xhlock(current->xhlock_idx)); + if (current->xhlocks) + invalidate_xhlock(&xhlock(current->xhlock_idx)); } static int cross_lock(struct lockdep_map *lock) -- cgit v1.2.3 From c6b9d9a33029014446bd9ed84c1688f6d3d4eab9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 5 Dec 2017 23:15:31 -0800 Subject: sched/wait: Fix add_wait_queue() behavioral change The following cleanup commit: 50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries") ... unintentionally changed the behavior of add_wait_queue() from inserting the wait entry at the head of the wait queue to the tail of the wait queue. Beyond a negative performance impact this change in behavior theoretically also breaks wait queues which mix exclusive and non-exclusive waiters, as non-exclusive waiters will not be woken up if they are queued behind enough exclusive waiters. Signed-off-by: Omar Sandoval Reviewed-by: Jens Axboe Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: kernel-team@fb.com Fixes: ("sched/wait: Standardize internal naming of wait-queue entries") Link: http://lkml.kernel.org/r/a16c8ccffd39bd08fdaa45a5192294c784b803a7.1512544324.git.osandov@fb.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 98feab7933c7..929ecb7d6b78 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - __add_wait_queue_entry_tail(wq_head, wq_entry); + __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); -- cgit v1.2.3 From a4c3c04974d648ee6e1a09ef4131eb32a02ab494 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 16 Nov 2017 15:21:52 +0100 Subject: sched/fair: Update and fix the runnable propagation rule Unlike running, the runnable part can't be directly propagated through the hierarchy when we migrate a task. The main reason is that runnable time can be shared with other sched_entities that stay on the rq and this runnable time will also remain on prev cfs_rq and must not be removed. Instead, we can estimate what should be the new runnable of the prev cfs_rq and check that this estimation stay in a possible range. The prop_runnable_sum is a good estimation when adding runnable_sum but fails most often when we remove it. Instead, we could use the formula below instead: gcfs_rq's runnable_sum = gcfs_rq->avg.load_sum / gcfs_rq->load.weight which assumes that tasks are equally runnable which is not true but easy to compute. Beside these estimates, we have several simple rules that help us to filter out wrong ones: - ge->avg.runnable_sum <= than LOAD_AVG_MAX - ge->avg.runnable_sum >= ge->avg.running_sum (ge->avg.util_sum << LOAD_AVG_MAX) - ge->avg.runnable_sum can't increase when we detach a task The effect of these fixes is better cgroups balancing. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Chris Mason Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1510842112-21028-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 102 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4037e19bbca2..2fe3aa853e4d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * - * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and - * simply copies the running sum over. + * Per the above update_tg_cfs_util() is trivial and simply copies the running + * sum over (but still wrong, because the group entity and group rq do not have + * their PELT windows aligned). * * However, update_tg_cfs_runnable() is more complex. So we have: * @@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se, * And since, like util, the runnable part should be directly transferable, * the following would _appear_ to be the straight forward approach: * - * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) + * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) * * And per (1) we have: * - * ge->avg.running_avg == grq->avg.running_avg + * ge->avg.runnable_avg == grq->avg.runnable_avg * * Which gives: * @@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se, * to (shortly) return to us. This only works by keeping the weights as * integral part of the sum. We therefore cannot decompose as per (3). * - * OK, so what then? + * Another reason this doesn't work is that runnable isn't a 0-sum entity. + * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the + * rq itself is runnable anywhere between 2/3 and 1 depending on how the + * runnable section of these tasks overlap (or not). If they were to perfectly + * align the rq as a whole would be runnable 2/3 of the time. If however we + * always have at least 1 runnable task, the rq as a whole is always runnable. * + * So we'll have to approximate.. :/ * - * Another way to look at things is: + * Given the constraint: * - * grq->avg.load_avg = \Sum se->avg.load_avg + * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX * - * Therefore, per (2): + * We can construct a rule that adds runnable to a rq by assuming minimal + * overlap. * - * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * On removal, we'll assume each task is equally runnable; which yields: * - * And the very thing we're propagating is a change in that sum (someone - * joined/left). So we can easily know the runnable change, which would be, per - * (2) the already tracked se->load_avg divided by the corresponding - * se->weight. + * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight * - * Basically (4) but in differential form: + * XXX: only do this for the part of runnable > running ? * - * d(runnable_avg) += se->avg.load_avg / se->load.weight - * (5) - * ge->avg.load_avg += ge->load.weight * d(runnable_avg) */ static inline void @@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq if (!delta) return; + /* + * The relation between sum and avg is: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * however, the PELT windows are not aligned between grq and gse. + */ + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; @@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long runnable_sum = gcfs_rq->prop_runnable_sum; - long runnable_load_avg, load_avg; - s64 runnable_load_sum, load_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + unsigned long runnable_load_avg, load_avg; + u64 runnable_load_sum, load_sum = 0; + s64 delta_sum; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + if (runnable_sum >= 0) { + /* + * Add runnable; clip at LOAD_AVG_MAX. Reflects that until + * the CPU is saturated running == runnable. + */ + runnable_sum += se->avg.load_sum; + runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + } else { + /* + * Estimate the new unweighted runnable_sum of the gcfs_rq by + * assuming all tasks are equally runnable. + */ + if (scale_load_down(gcfs_rq->load.weight)) { + load_sum = div_s64(gcfs_rq->avg.load_sum, + scale_load_down(gcfs_rq->load.weight)); + } + + /* But make sure to not inflate se's runnable */ + runnable_sum = min(se->avg.load_sum, load_sum); + } + + /* + * runnable_sum can't be lower than running_sum + * As running sum is scale with cpu capacity wehreas the runnable sum + * is not we rescale running_sum 1st + */ + running_sum = se->avg.util_sum / + arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + runnable_sum = max(runnable_sum, running_sum); + load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, LOAD_AVG_MAX); - add_positive(&se->avg.load_sum, runnable_sum); - add_positive(&se->avg.load_avg, load_avg); + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; + delta_avg = load_avg - se->avg.load_avg; - add_positive(&cfs_rq->avg.load_avg, load_avg); - add_positive(&cfs_rq->avg.load_sum, load_sum); + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - add_positive(&se->avg.runnable_load_sum, runnable_sum); - add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { - add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } } -- cgit v1.2.3 From c07d35338081d107e57cf37572d8cc931a8e32e2 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 2 Mar 2015 14:13:36 +0000 Subject: kdb: Fix handling of kallsyms_symbol_next() return value kallsyms_symbol_next() returns a boolean (true on success). Currently kdb_read() tests the return value with an inequality that unconditionally evaluates to true. This is fixed in the obvious way and, since the conditional branch is supposed to be unreachable, we also add a WARN_ON(). Reported-by: Dan Carpenter Signed-off-by: Daniel Thompson Cc: linux-stable Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index e74be38245ad..ed5d34925ad0 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -350,7 +350,7 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (kallsyms_symbol_next(p_tmp, i) < 0) + if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) break; kdb_printf("%s ", p_tmp); *(p_tmp + len) = '\0'; -- cgit v1.2.3 From d70ef22892ed6c066e51e118b225923c9b74af34 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 30 Nov 2017 15:35:44 +0100 Subject: futex: futex_wake_op, fix sign_extend32 sign bits sign_extend32 counts the sign bit parameter from 0, not from 1. So we have to use "11" for 12th bit, not "12". This mistake means we have not allowed negative op and cmp args since commit 30d6e0a4190d ("futex: Remove duplicated code and fix undefined behaviour") till now. Fixes: 30d6e0a4190d ("futex: Remove duplicated code and fix undefined behaviour") Signed-off-by: Jiri Slaby Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Darren Hart Signed-off-by: Linus Torvalds --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 76ed5921117a..57d0b3657e16 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1582,8 +1582,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; - int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); - int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { -- cgit v1.2.3 From 2064a5ab04707c55003e099e5abbf19a0826bbac Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 3 Dec 2017 13:19:00 -0800 Subject: sched/core: Fix kernel-doc warnings after code movement Fix the following kernel-doc warnings after code restructuring: ../kernel/sched/core.c:5113: warning: No description found for parameter 't' ../kernel/sched/core.c:5113: warning: Excess function parameter 'interval' description in 'sched_rr_get_interval' get rid of set_fs()") Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: abca5fc535a3e ("sched_rr_get_interval(): move compat to native, Link: http://lkml.kernel.org/r/995c6ded-b32e-bbe4-d9f5-4d42d121aff1@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..644fa2e3d993 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) return ret; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; @@ -5144,6 +5133,17 @@ out_unlock: return retval; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { -- cgit v1.2.3 From 01dfee9582d9b4403c4902df096ed8b43d55181c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 8 Dec 2017 11:56:14 +0900 Subject: workqueue: remove unneeded kallsyms include The filw was converted from print_symbol() to %pf some time ago (044c782ce3a901fb "workqueue: fix checkpatch issues"). kallsyms does not seem to be needed anymore. Signed-off-by: Sergey Senozhatsky Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45ce93f3dd1f..43d18cb46308 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From f87f3a328dbbb3e79dd53e7e889ced9222512649 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:18 +0000 Subject: locking/core: Fix deadlock during boot on systems with GENERIC_LOCKBREAK Commit: a8a217c22116 ("locking/core: Remove {read,spin,write}_can_lock()") removed the definition of raw_spin_can_lock(), causing the GENERIC_LOCKBREAK spin_lock() routines to poll the ->break_lock field when waiting on a lock. This has been reported to cause a deadlock during boot on s390, because the ->break_lock field is also set by the waiters, and can potentially remain set indefinitely if no other CPUs come in to take the lock after it has been released. This patch removes the explicit spinning on ->break_lock from the waiters, instead relying on the outer trylock() operation to determine when the lock is available. Reported-by: Sebastian Ott Tested-by: Sebastian Ott Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Thomas Gleixner Fixes: a8a217c22116 ("locking/core: Remove {read,spin,write}_can_lock()") Link: http://lkml.kernel.org/r/1511894539-7988-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 1fd1a7543cdd..0ebb253e2199 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -68,8 +68,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -88,8 +88,8 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ -- cgit v1.2.3 From d89c70356acf11b7cf47ca5cfcafae5062a85451 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:19 +0000 Subject: locking/core: Remove break_lock field when CONFIG_GENERIC_LOCKBREAK=y When CONFIG_GENERIC_LOCKBEAK=y, locking structures grow an extra int ->break_lock field which is used to implement raw_spin_is_contended() by setting the field to 1 when waiting on a lock and clearing it to zero when holding a lock. However, there are a few problems with this approach: - There is a write-write race between a CPU successfully taking the lock (and subsequently writing break_lock = 0) and a waiter waiting on the lock (and subsequently writing break_lock = 1). This could result in a contended lock being reported as uncontended and vice-versa. - On machines with store buffers, nothing guarantees that the writes to break_lock are visible to other CPUs at any particular time. - READ_ONCE/WRITE_ONCE are not used, so the field is potentially susceptible to harmful compiler optimisations, Consequently, the usefulness of this field is unclear and we'd be better off removing it and allowing architectures to implement raw_spin_is_contended() by providing a definition of arch_spin_is_contended(), as they can when CONFIG_GENERIC_LOCKBREAK=n. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Sebastian Ott Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1511894539-7988-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 0ebb253e2199..936f3d14dd6b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ @@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ -- cgit v1.2.3 From e966eaeeb623f09975ef362c2866fae6f86844f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Dec 2017 12:31:16 +0100 Subject: locking/lockdep: Remove the cross-release locking checks This code (CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y), while it found a number of old bugs initially, was also causing too many false positives that caused people to disable lockdep - which is arguably a worse overall outcome. If we disable cross-release by default but keep the code upstream then in practice the most likely outcome is that we'll allow the situation to degrade gradually, by allowing entropy to introduce more and more false positives, until it overwhelms maintenance capacity. Another bad side effect was that people were trying to work around the false positives by uglifying/complicating unrelated code. There's a marked difference between annotating locking operations and uglifying good code just due to bad lock debugging code ... This gradual decrease in quality happened to a number of debugging facilities in the kernel, and lockdep is pretty complex already, so we cannot risk this outcome. Either cross-release checking can be done right with no false positives, or it should not be included in the upstream kernel. ( Note that it might make sense to maintain it out of tree and go through the false positives every now and then and see whether new bugs were introduced. ) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 652 +++-------------------------------------------- 1 file changed, 35 insertions(+), 617 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 670d8d7d8087..5fa1324a4f29 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#include -#endif - #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif -#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK -static int crossrelease_fullstack = 1; -#else -static int crossrelease_fullstack; -#endif -static int __init allow_crossrelease_fullstack(char *str) -{ - crossrelease_fullstack = 1; - return 0; -} - -early_param("crossrelease_fullstack", allow_crossrelease_fullstack); - /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -static void cross_init(struct lockdep_map *lock, int cross); -static int cross_lock(struct lockdep_map *lock); -static int lock_acquire_crosslock(struct held_lock *hlock); -static int lock_release_crosslock(struct lockdep_map *lock); -#else -static inline void cross_init(struct lockdep_map *lock, int cross) {} -static inline int cross_lock(struct lockdep_map *lock) { return 0; } -static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } -static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } -#endif - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - if (cross_lock(tgt->instance)) { - printk(" Possible unsafe locking scenario by crosslock:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk(" unlock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } else { - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); } /* @@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, curr->comm, task_pid_nr(curr)); print_lock(check_src); - if (cross_lock(check_tgt->instance)) - pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); - else - pr_warn("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); @@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (cross_lock(check_tgt->instance)) - this->trace = *trace; - else if (!save_trace(&this->trace)) + if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - if (cross_lock(prev->instance)) - continue; - return print_deadlock_bug(curr, prev, next); } return 1; @@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; + /* - * Only non-crosslock entries get new dependencies added. - * Crosslock entries will be added by commit later: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!cross_lock(hlock->instance)) { + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + if (!ret) + return 0; + /* - * Only non-recursive-read entries get new dependencies - * added: + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save_trace); - if (!ret) - return 0; - - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } + if (!hlock->trylock) + break; } + depth--; /* * End of lock-stack? @@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - cross_init(lock, 0); __lockdep_init_map(lock, name, key, subclass); } EXPORT_SYMBOL_GPL(lockdep_init_map); -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - cross_init(lock, 1); - __lockdep_init_map(lock, name, key, subclass); -} -EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); -#endif - struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; - int ret; if (unlikely(!debug_locks)) return 0; @@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - /* TODO: nest_lock is not implemented for crosslock yet. */ - if (depth && !cross_lock(lock)) { + if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; - ret = lock_acquire_crosslock(hlock); - /* - * 2 means normal acquire operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int ret, i; + int i; if (unlikely(!debug_locks)) return 0; - ret = lock_release_crosslock(lock); - /* - * 2 means normal release operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -4675,495 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - -/* - * Crossrelease works by recording a lock history for each thread and - * connecting those historic locks that were taken after the - * wait_for_completion() in the complete() context. - * - * Task-A Task-B - * - * mutex_lock(&A); - * mutex_unlock(&A); - * - * wait_for_completion(&C); - * lock_acquire_crosslock(); - * atomic_inc_return(&cross_gen_id); - * | - * | mutex_lock(&B); - * | mutex_unlock(&B); - * | - * | complete(&C); - * `-- lock_commit_crosslock(); - * - * Which will then add a dependency between B and C. - */ - -#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) - -/* - * Whenever a crosslock is held, cross_gen_id will be increased. - */ -static atomic_t cross_gen_id; /* Can be wrapped */ - -/* - * Make an entry of the ring buffer invalid. - */ -static inline void invalidate_xhlock(struct hist_lock *xhlock) -{ - /* - * Normally, xhlock->hlock.instance must be !NULL. - */ - xhlock->hlock.instance = NULL; -} - -/* - * Lock history stacks; we have 2 nested lock history stacks: - * - * HARD(IRQ) - * SOFT(IRQ) - * - * The thing is that once we complete a HARD/SOFT IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. So - * what we do is rewind the history buffer and erase all our knowledge of that - * temporal event. - */ - -void crossrelease_hist_start(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } -} - -/* - * lockdep_invariant_state() is used to annotate independence inside a task, to - * make one task look like multiple independent 'tasks'. - * - * Take for instance workqueues; each work is independent of the last. The - * completion of a future work does not depend on the completion of a past work - * (in general). Therefore we must not carry that (lock) dependency across - * works. - * - * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an invariant state and future completions do not - * depend on past completions. Its just that since they all have the 'same' - * form -- the kthread does the same over and over -- it doesn't typically - * matter. - * - * The same is true for system-calls, once a system call is completed (we've - * returned to userspace) the next system call does not depend on the lock - * history of the previous system call. - * - * They key property for independence, this invariant state, is that it must be - * a point where we hold no locks and have no history. Because if we were to - * hold locks, the restore at _end() would not necessarily recover it's history - * entry. Similarly, independence per-definition means it does not depend on - * prior state. - */ -void lockdep_invariant_state(bool force) -{ - /* - * We call this at an invariant point, no current state, no history. - * Verify the former, enforce the latter. - */ - WARN_ON_ONCE(!force && current->lockdep_depth); - if (current->xhlocks) - invalidate_xhlock(&xhlock(current->xhlock_idx)); -} - -static int cross_lock(struct lockdep_map *lock) -{ - return lock ? lock->cross : 0; -} - -/* - * This is needed to decide the relationship between wrapable variables. - */ -static inline int before(unsigned int a, unsigned int b) -{ - return (int)(a - b) < 0; -} - -static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) -{ - return hlock_class(&xhlock->hlock); -} - -static inline struct lock_class *xlock_class(struct cross_lock *xlock) -{ - return hlock_class(&xlock->hlock); -} - -/* - * Should we check a dependency with previous one? - */ -static inline int depend_before(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check && !hlock->trylock; -} - -/* - * Should we check a dependency with next one? - */ -static inline int depend_after(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check; -} - -/* - * Check if the xhlock is valid, which would be false if, - * - * 1. Has not used after initializaion yet. - * 2. Got invalidated. - * - * Remind hist_lock is implemented as a ring buffer. - */ -static inline int xhlock_valid(struct hist_lock *xhlock) -{ - /* - * xhlock->hlock.instance must be !NULL. - */ - return !!xhlock->hlock.instance; -} - -/* - * Record a hist_lock entry. - * - * Irq disable is only required. - */ -static void add_xhlock(struct held_lock *hlock) -{ - unsigned int idx = ++current->xhlock_idx; - struct hist_lock *xhlock = &xhlock(idx); - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * This can be done locklessly because they are all task-local - * state, we must however ensure IRQs are disabled. - */ - WARN_ON_ONCE(!irqs_disabled()); -#endif - - /* Initialize hist_lock's members */ - xhlock->hlock = *hlock; - xhlock->hist_id = ++current->hist_id; - - xhlock->trace.nr_entries = 0; - xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; - xhlock->trace.entries = xhlock->trace_entries; - - if (crossrelease_fullstack) { - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); - } else { - xhlock->trace.nr_entries = 1; - xhlock->trace.entries[0] = hlock->acquire_ip; - } -} - -static inline int same_context_xhlock(struct hist_lock *xhlock) -{ - return xhlock->hlock.irq_context == task_irq_context(current); -} - -/* - * This should be lockless as far as possible because this would be - * called very frequently. - */ -static void check_add_xhlock(struct held_lock *hlock) -{ - /* - * Record a hist_lock, only in case that acquisitions ahead - * could depend on the held_lock. For example, if the held_lock - * is trylock then acquisitions ahead never depends on that. - * In that case, we don't need to record it. Just return. - */ - if (!current->xhlocks || !depend_before(hlock)) - return; - - add_xhlock(hlock); -} - -/* - * For crosslock. - */ -static int add_xlock(struct held_lock *hlock) -{ - struct cross_lock *xlock; - unsigned int gen_id; - - if (!graph_lock()) - return 0; - - xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; - - /* - * When acquisitions for a crosslock are overlapped, we use - * nr_acquire to perform commit for them, based on cross_gen_id - * of the first acquisition, which allows to add additional - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - * - * depend_after() is necessary to initialize only the first - * valid xlock so that the xlock can be used on its commit. - */ - if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) - goto unlock; - - gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); - xlock->hlock = *hlock; - xlock->hlock.gen_id = gen_id; -unlock: - graph_unlock(); - return 1; -} - -/* - * Called for both normal and crosslock acquires. Normal locks will be - * pushed on the hist_lock queue. Cross locks will record state and - * stop regular lock_acquire() to avoid being placed on the held_lock - * stack. - * - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_acquire_crosslock(struct held_lock *hlock) -{ - /* - * CONTEXT 1 CONTEXT 2 - * --------- --------- - * lock A (cross) - * X = atomic_inc_return(&cross_gen_id) - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Y = atomic_read_acquire(&cross_gen_id) - * lock B - * - * atomic_read_acquire() is for ordering between A and B, - * IOW, A happens before B, when CONTEXT 2 see Y >= X. - * - * Pairs with atomic_inc_return() in add_xlock(). - */ - hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); - - if (cross_lock(hlock->instance)) - return add_xlock(hlock); - - check_add_xhlock(hlock); - return 2; -} - -static int copy_trace(struct stack_trace *trace) -{ - unsigned long *buf = stack_trace + nr_stack_trace_entries; - unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - unsigned int nr = min(max_nr, trace->nr_entries); - - trace->nr_entries = nr; - memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); - trace->entries = buf; - nr_stack_trace_entries += nr; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) -{ - unsigned int xid, pid; - u64 chain_key; - - xid = xlock_class(xlock) - lock_classes; - chain_key = iterate_chain_key((u64)0, xid); - pid = xhlock_class(xhlock) - lock_classes; - chain_key = iterate_chain_key(chain_key, pid); - - if (lookup_chain_cache(chain_key)) - return 1; - - if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, - chain_key)) - return 0; - - if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, - &xhlock->trace, copy_trace)) - return 0; - - return 1; -} - -static void commit_xhlocks(struct cross_lock *xlock) -{ - unsigned int cur = current->xhlock_idx; - unsigned int prev_hist_id = xhlock(cur).hist_id; - unsigned int i; - - if (!graph_lock()) - return; - - if (xlock->nr_acquire) { - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); - - if (!xhlock_valid(xhlock)) - break; - - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; - - if (!same_context_xhlock(xhlock)) - break; - - /* - * Filter out the cases where the ring buffer was - * overwritten and the current entry has a bigger - * hist_id than the previous one, which is impossible - * otherwise: - */ - if (unlikely(before(prev_hist_id, xhlock->hist_id))) - break; - - prev_hist_id = xhlock->hist_id; - - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; - } - } - - graph_unlock(); -} - -void lock_commit_crosslock(struct lockdep_map *lock) -{ - struct cross_lock *xlock; - unsigned long flags; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (!current->xhlocks) - return; - - /* - * Do commit hist_locks with the cross_lock, only in case that - * the cross_lock could depend on acquisitions after that. - * - * For example, if the cross_lock does not have the 'check' flag - * then we don't need to check dependencies and commit for that. - * Just skip it. In that case, of course, the cross_lock does - * not depend on acquisitions ahead, either. - * - * WARNING: Don't do that in add_xlock() in advance. When an - * acquisition context is different from the commit context, - * invalid(skipped) cross_lock might be accessed. - */ - if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - xlock = &((struct lockdep_map_cross *)lock)->xlock; - commit_xhlocks(xlock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_commit_crosslock); - -/* - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_release_crosslock(struct lockdep_map *lock) -{ - if (cross_lock(lock)) { - if (!graph_lock()) - return 0; - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; - graph_unlock(); - return 1; - } - return 2; -} - -static void cross_init(struct lockdep_map *lock, int cross) -{ - if (cross) - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; - - lock->cross = cross; - - /* - * Crossrelease assumes that the ring buffer size of xhlocks - * is aligned with power of 2. So force it on build. - */ - BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); -} - -void lockdep_init_task(struct task_struct *task) -{ - int i; - - task->xhlock_idx = UINT_MAX; - task->hist_id = 0; - - for (i = 0; i < XHLOCK_CTX_NR; i++) { - task->xhlock_idx_hist[i] = UINT_MAX; - task->hist_id_save[i] = 0; - } - - task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, - GFP_KERNEL); -} - -void lockdep_free_task(struct task_struct *task) -{ - if (task->xhlocks) { - void *tmp = task->xhlocks; - /* Diable crossrelease for current */ - task->xhlocks = NULL; - kfree(tmp); - } -} -#endif -- cgit v1.2.3 From 283ca526a9bd75aed7350220d7b1f8027d99c3fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:30 +0100 Subject: bpf: fix corruption on concurrent perf_event_output calls When tracing and networking programs are both attached in the system and both use event-output helpers that eventually call into perf_event_output(), then we could end up in a situation where the tracing attached program runs in user context while a cls_bpf program is triggered on that same CPU out of softirq context. Since both rely on the same per-cpu perf_sample_data, we could potentially corrupt it. This can only ever happen in a combination of the two types; all tracing programs use a bpf_prog_active counter to bail out in case a program is already running on that CPU out of a different context. XDP and cls_bpf programs by themselves don't have this issue as they run in the same context only. Therefore, split both perf_sample_data so they cannot be accessed from each other. Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Song Liu Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ce99c379c30..40207c2a4113 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_raw_record *raw) + u64 flags, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; @@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(sd, 0, 0); - sd->raw = raw; perf_event_output(event, sd, regs); return 0; } @@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); struct perf_raw_record raw = { .frag = { .size = size, @@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; - return __bpf_perf_event_output(regs, map, flags, &raw); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; + + return __bpf_perf_event_output(regs, map, flags, sd); } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); struct perf_raw_frag frag = { .copy = ctx_copy, @@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; perf_fetch_caller_regs(regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, &raw); + return __bpf_perf_event_output(regs, map, flags, sd); } BPF_CALL_0(bpf_get_current_task) -- cgit v1.2.3 From 9147efcbe0b7cc96b18eb64b1a3f0d4bba81443c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 14:22:39 -0800 Subject: bpf: add schedule points to map alloc/free While using large percpu maps, htab_map_alloc() can hold cpu for hundreds of ms. This patch adds cond_resched() calls to percpu alloc/free call sites, all running in process context. Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e469e05c8e83..3905d4bc5b80 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); + cond_resched(); } free_elems: bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); + cond_resched(); } skip_percpu_elems: -- cgit v1.2.3 From 689d77f001cd22da31cc943170e1f6f2e8197035 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Dec 2017 15:33:02 -0800 Subject: kcov: fix comparison callback signature Fix a silly copy-paste bug. We truncated u32 args to u16. Link: http://lkml.kernel.org/r/20171207101134.107168-1-dvyukov@google.com Fixes: ded97d2c2b2c ("kcov: support comparison operands collection") Signed-off-by: Dmitry Vyukov Cc: syzkaller@googlegroups.com Cc: Alexander Potapenko Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33faf4013..7594c033d98a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); -void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); } @@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); -void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); -- cgit v1.2.3 From 7c2c11b208be09c156573fc0076b7b3646e05219 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 14 Dec 2017 15:33:19 -0800 Subject: arch: define weak abort() gcc toggle -fisolate-erroneous-paths-dereference (default at -O2 onwards) isolates faulty code paths such as null pointer access, divide by zero etc. If gcc port doesnt implement __builtin_trap, an abort() is generated which causes kernel link error. In this case, gcc is generating abort due to 'divide by zero' in lib/mpi/mpih-div.c. Currently 'frv' and 'arc' are failing. Previously other arch was also broken like m32r was fixed by commit d22e3d69ee1a ("m32r: fix build failure"). Let's define this weak function which is common for all arch and fix the problem permanently. We can even remove the arch specific 'abort' after this is done. Link: http://lkml.kernel.org/r/1513118956-8718-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Alexey Brodkin Cc: Vineet Gupta Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a41167..df0c91d5606c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1755,3 +1755,11 @@ Efault: return -EFAULT; } #endif + +__weak void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} -- cgit v1.2.3 From b00d607bb188e187c7b60074d2fa91a6f1985029 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 5 Dec 2017 04:41:51 -0500 Subject: tracing: Have stack trace not record if RCU is not watching The stack tracer records a stack dump whenever it sees a stack usage that is more than what it ever saw before. This can happen at any function that is being traced. If it happens when the CPU is going idle (or other strange locations), RCU may not be watching, and in this case, the recording of the stack trace will trigger a warning. There's been lots of efforts to make hacks to allow stack tracing to proceed even if RCU is not watching, but this only causes more issues to appear. Simply do not trace a stack if RCU is not watching. It probably isn't a bad stack anyway. Acked-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc02418..3c7bfc4bf5e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (__this_cpu_read(disable_stack_tracer) != 1) goto out; + /* If rcu is not watching, then save stack trace can fail */ + if (!rcu_is_watching()) + goto out; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); -- cgit v1.2.3 From cef31d9af908243421258f1df35a4a644604efbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Dec 2017 10:32:03 +0100 Subject: posix-timer: Properly check sigevent->sigev_notify timer_create() specifies via sigevent->sigev_notify the signal delivery for the new timer. The valid modes are SIGEV_NONE, SIGEV_SIGNAL, SIGEV_THREAD and (SIGEV_SIGNAL | SIGEV_THREAD_ID). The sanity check in good_sigevent() is only checking the valid combination for the SIGEV_THREAD_ID bit, i.e. SIGEV_SIGNAL, but if SIGEV_THREAD_ID is not set it accepts any random value. This has no real effects on the posix timer and signal delivery code, but it affects show_timer() which handles the output of /proc/$PID/timers. That function uses a string array to pretty print sigev_notify. The access to that array has no bound checks, so random sigev_notify cause access beyond the array bounds. Add proper checks for the valid notify modes and remove the SIGEV_THREAD_ID masking from various code pathes as SIGEV_NONE can never be set in combination with SIGEV_THREAD_ID. Reported-by: Eric Biggers Reported-by: Dmitry Vyukov Reported-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: stable@vger.kernel.org --- kernel/time/posix-timers.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881f908b..ec999f32c840 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } static struct k_itimer * alloc_posix_timer(void) @@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ @@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); - sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; -- cgit v1.2.3 From f73c52a5bcd1710994e53fbccc378c42b97a06b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 2 Dec 2017 13:04:54 -0500 Subject: sched/rt: Do not pull from current CPU if only one CPU to pull Daniel Wagner reported a crash on the BeagleBone Black SoC. This is a single CPU architecture, and does not have a functional arch_send_call_function_single_ipi() implementation which can crash the kernel if that is called. As it only has one CPU, it shouldn't be called, but if the kernel is compiled for SMP, the push/pull RT scheduling logic now calls it for irq_work if the one CPU is overloaded, it can use that function to call itself and crash the kernel. Ideally, we should disable the SCHED_FEAT(RT_PUSH_IPI) if the system only has a single CPU. But SCHED_FEAT is a constant if sched debugging is turned off. Another fix can also be used, and this should also help with normal SMP machines. That is, do not initiate the pull code if there's only one RT overloaded CPU, and that CPU happens to be the current CPU that is scheduling in a lower priority task. Even on a system with many CPUs, if there's many RT tasks waiting to run on a single CPU, and that CPU schedules in another RT task of lower priority, it will initiate the PULL logic in case there's a higher priority RT task on another CPU that is waiting to run. But if there is no other CPU with waiting RT tasks, it will initiate the RT pull logic on itself (as it still has RT tasks waiting to run). This is a wasted effort. Not only does this help with SMP code where the current CPU is the only one with RT overloaded tasks, it should also solve the issue that Daniel encountered, because it will prevent the PULL logic from executing, as there's only one CPU on the system, and the check added here will cause it to exit the RT pull code. Reported-by: Daniel Wagner Signed-off-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: linux-rt-users Cc: stable@vger.kernel.org Fixes: 4bdced5c9 ("sched/rt: Simplify the IPI based RT balancing logic") Link: http://lkml.kernel.org/r/20171202130454.4cbbfe8d@vmware.local.home Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..665ace2fc558 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); -- cgit v1.2.3