From 488603b815a7514c7009e6fc339d74ed4a30f343 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Sat, 11 Jan 2020 04:53:38 -0500 Subject: sched/core: Don't skip remote tick for idle CPUs This will be used in the next patch to get a loadavg update from nohz cpus. The delta check is skipped because idle_sched_class doesn't update se.exec_start. Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-2-git-send-email-swood@redhat.com --- kernel/sched/core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc1dfc007604..cf8b33dc4513 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3669,22 +3669,24 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + if (!tick_nohz_tick_stopped_cpu(cpu)) goto out_requeue; rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr) || cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); - delta = rq_clock_task(rq) - curr->se.exec_start; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } curr->sched_class->task_tick(rq, curr, 0); out_unlock: -- cgit v1.2.3 From ebc0f83c78a2d26384401ecf2d2fa48063c0ee27 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sat, 11 Jan 2020 04:53:39 -0500 Subject: timers/nohz: Update NOHZ load in remote tick The way loadavg is tracked during nohz only pays attention to the load upon entering nohz. This can be particularly noticeable if full nohz is entered while non-idle, and then the cpu goes idle and stays that way for a long time. Use the remote tick to ensure that full nohz cpus report their deltas within a reasonable time. [ swood: Added changelog and removed recheck of stopped tick. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-3-git-send-email-swood@redhat.com --- kernel/sched/core.c | 4 +++- kernel/sched/loadavg.c | 33 +++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf8b33dc4513..4ff03c27779e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3677,6 +3677,7 @@ static void sched_tick_remote(struct work_struct *work) if (cpu_is_offline(cpu)) goto out_unlock; + curr = rq->curr; update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -3689,10 +3690,11 @@ static void sched_tick_remote(struct work_struct *work) } curr->sched_class->task_tick(rq, curr, 0); + calc_load_nohz_remote(rq); out_unlock: rq_unlock_irq(rq, &rf); - out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 28a516575c18..de22da666ac7 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_nohz_start(void) +static void calc_load_nohz_fold(struct rq *rq) { - struct rq *this_rq = this_rq(); long delta; - /* - * We're going into NO_HZ mode, if there's any pending delta, fold it - * into the pending NO_HZ delta. - */ - delta = calc_load_fold_active(this_rq, 0); + delta = calc_load_fold_active(rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -248,6 +243,24 @@ void calc_load_nohz_start(void) } } +void calc_load_nohz_start(void) +{ + /* + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. + */ + calc_load_nohz_fold(this_rq()); +} + +/* + * Keep track of the load for NOHZ_FULL, must be called between + * calc_load_nohz_{start,stop}(). + */ +void calc_load_nohz_remote(struct rq *rq) +{ + calc_load_nohz_fold(rq); +} + void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -268,7 +281,7 @@ void calc_load_nohz_stop(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_nohz_fold(void) +static long calc_load_nohz_read(void) { int idx = calc_load_read_idx(); long delta = 0; @@ -323,7 +336,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_nohz_fold(void) { return 0; } +static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks) /* * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. */ - delta = calc_load_nohz_fold(); + delta = calc_load_nohz_read(); if (delta) atomic_long_add(delta, &calc_load_tasks); -- cgit v1.2.3 From b396f52326de20ec974471b7b19168867b365cbf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 14 Jan 2020 10:13:20 +0000 Subject: sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains The CPU load balancer balances between different domains to spread load and strives to have equal balance everywhere. Communicating tasks can migrate so they are topologically close to each other but these decisions are independent. On a lightly loaded NUMA machine, two communicating tasks pulled together at wakeup time can be pushed apart by the load balancer. In isolation, the load balancer decision is fine but it ignores the tasks data locality and the wakeup/LB paths continually conflict. NUMA balancing is also a factor but it also simply conflicts with the load balancer. This patch allows a fixed degree of imbalance of two tasks to exist between NUMA domains regardless of utilisation levels. In many cases, this prevents communicating tasks being pulled apart. It was evaluated whether the imbalance should be scaled to the domain size. However, no additional benefit was measured across a range of workloads and machines and scaling adds the risk that lower domains have to be rebalanced. While this could change again in the future, such a change should specify the use case and benefit. The most obvious impact is on netperf TCP_STREAM -- two simple communicating tasks with some softirq offload depending on the transmission rate. 2-socket Haswell machine 48 core, HT enabled netperf-tcp -- mmtests config config-network-netperf-unbound baseline lbnuma-v3 Hmean 64 568.73 ( 0.00%) 577.56 * 1.55%* Hmean 128 1089.98 ( 0.00%) 1128.06 * 3.49%* Hmean 256 2061.72 ( 0.00%) 2104.39 * 2.07%* Hmean 1024 7254.27 ( 0.00%) 7557.52 * 4.18%* Hmean 2048 11729.20 ( 0.00%) 13350.67 * 13.82%* Hmean 3312 15309.08 ( 0.00%) 18058.95 * 17.96%* Hmean 4096 17338.75 ( 0.00%) 20483.66 * 18.14%* Hmean 8192 25047.12 ( 0.00%) 27806.84 * 11.02%* Hmean 16384 27359.55 ( 0.00%) 33071.88 * 20.88%* Stddev 64 2.16 ( 0.00%) 2.02 ( 6.53%) Stddev 128 2.31 ( 0.00%) 2.19 ( 5.05%) Stddev 256 11.88 ( 0.00%) 3.22 ( 72.88%) Stddev 1024 23.68 ( 0.00%) 7.24 ( 69.43%) Stddev 2048 79.46 ( 0.00%) 71.49 ( 10.03%) Stddev 3312 26.71 ( 0.00%) 57.80 (-116.41%) Stddev 4096 185.57 ( 0.00%) 96.15 ( 48.19%) Stddev 8192 245.80 ( 0.00%) 100.73 ( 59.02%) Stddev 16384 207.31 ( 0.00%) 141.65 ( 31.67%) In this case, there was a sizable improvement to performance and a general reduction in variance. However, this is not univeral. For most machines, the impact was roughly a 3% performance gain. Ops NUMA base-page range updates 19796.00 292.00 Ops NUMA PTE updates 19796.00 292.00 Ops NUMA PMD updates 0.00 0.00 Ops NUMA hint faults 16113.00 143.00 Ops NUMA hint local faults % 8407.00 142.00 Ops NUMA hint local percent 52.18 99.30 Ops NUMA pages migrated 4244.00 1.00 Without the patch, only 52.18% of sampled accesses are local. In an earlier changelog, 100% of sampled accesses are local and indeed on most machines, this was still the case. In this specific case, the local sampled rates was 99.3% but note the "base-page range updates" and "PTE updates". The activity with the patch is negligible as were the number of faults. The small number of pages migrated were related to shared libraries. A 2-socket Broadwell showed better results on average but are not presented for brevity as the performance was similar except it showed 100% of the sampled NUMA hints were local. The patch holds up for a 4-socket Haswell, an AMD EPYC and AMD Epyc 2 machine. For dbench, the impact depends on the filesystem used and the number of clients. On XFS, there is little difference as the clients typically communicate with workqueues which have a separate class of scheduler problem at the moment. For ext4, performance is generally better, particularly for small numbers of clients as NUMA balancing activity is negligible with the patch applied. A more interesting example is the Facebook schbench which uses a number of messaging threads to communicate with worker threads. In this configuration, one messaging thread is used per NUMA node and the number of worker threads is varied. The 50, 75, 90, 95, 99, 99.5 and 99.9 percentiles for response latency is then reported. Lat 50.00th-qrtle-1 44.00 ( 0.00%) 37.00 ( 15.91%) Lat 75.00th-qrtle-1 53.00 ( 0.00%) 41.00 ( 22.64%) Lat 90.00th-qrtle-1 57.00 ( 0.00%) 42.00 ( 26.32%) Lat 95.00th-qrtle-1 63.00 ( 0.00%) 43.00 ( 31.75%) Lat 99.00th-qrtle-1 76.00 ( 0.00%) 51.00 ( 32.89%) Lat 99.50th-qrtle-1 89.00 ( 0.00%) 52.00 ( 41.57%) Lat 99.90th-qrtle-1 98.00 ( 0.00%) 55.00 ( 43.88%) Lat 50.00th-qrtle-2 42.00 ( 0.00%) 42.00 ( 0.00%) Lat 75.00th-qrtle-2 48.00 ( 0.00%) 47.00 ( 2.08%) Lat 90.00th-qrtle-2 53.00 ( 0.00%) 52.00 ( 1.89%) Lat 95.00th-qrtle-2 55.00 ( 0.00%) 53.00 ( 3.64%) Lat 99.00th-qrtle-2 62.00 ( 0.00%) 60.00 ( 3.23%) Lat 99.50th-qrtle-2 63.00 ( 0.00%) 63.00 ( 0.00%) Lat 99.90th-qrtle-2 68.00 ( 0.00%) 66.00 ( 2.94% For higher worker threads, the differences become negligible but it's interesting to note the difference in wakeup latency at low utilisation and mpstat confirms that activity was almost all on one node until the number of worker threads increase. Hackbench generally showed neutral results across a range of machines. This is different to earlier versions of the patch which allowed imbalances for higher degrees of utilisation. perf bench pipe showed negligible differences in overall performance as the differences are very close to the noise. An earlier prototype of the patch showed major regressions for NAS C-class when running with only half of the available CPUs -- 20-30% performance hits were measured at the time. With this version of the patch, the impact is negligible with small gains/losses within the noise measured. This is because the number of threads far exceeds the small imbalance the aptch cares about. Similarly, there were report of regressions for the autonuma benchmark against earlier versions but again, normal load balancing now applies for that workload. In general, the patch simply seeks to avoid unnecessary cross-node migrations in the basic case where imbalances are very small. For low utilisation communicating workloads, this patch generally behaves better with less NUMA balancing activity. For high utilisation, there is no change in behaviour. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Reviewed-by: Srikar Dronamraju Acked-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200114101319.GO3466@techsingularity.net --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4e0d775375..25dffc03f0f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8658,10 +8658,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * Try to use spare capacity of local group without overloading it or * emptying busiest. - * XXX Spreading tasks across NUMA nodes is not always the best policy - * and special care should be taken for SD_NUMA domain level before - * spreading the tasks. For now, load_balance() fully relies on - * NUMA_BALANCING and fbq_classify_group/rq to override the decision. */ if (local->group_type == group_has_spare) { if (busiest->group_type > group_fully_busy) { @@ -8701,16 +8697,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); env->imbalance = nr_diff >> 1; - return; - } + } else { - /* - * If there is no overload, we just want to even the number of - * idle cpus. - */ - env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1); + } + + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) { + unsigned int imbalance_min; + + /* + * Compute an allowed imbalance based on a simple + * pair of communicating tasks that should remain + * local and ignore them. + * + * NOTE: Generally this would have been based on + * the domain size and this was evaluated. However, + * the benefit is similar across a range of workloads + * and machines but scaling by the domain size adds + * the risk that lower domains have to be rebalanced. + */ + imbalance_min = 2; + if (busiest->sum_nr_running <= imbalance_min) + env->imbalance = 0; + } + return; } -- cgit v1.2.3 From b562d140649966d4daedd0483a8fe59ad3bb465a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 14 Jan 2020 21:09:47 +0000 Subject: sched/uclamp: Reject negative values in cpu_uclamp_write() The check to ensure that the new written value into cpu.uclamp.{min,max} is within range, [0:100], wasn't working because of the signed comparison 7301 if (req.percent > UCLAMP_PERCENT_SCALE) { 7302 req.ret = -ERANGE; 7303 return req; 7304 } # echo -1 > cpu.uclamp.min # cat cpu.uclamp.min 42949671.96 Cast req.percent into u64 to force the comparison to be unsigned and work as intended in capacity_from_percent(). # echo -1 > cpu.uclamp.min sh: write error: Numerical result out of range Fixes: 2480c093130f ("sched/uclamp: Extend CPU's cgroup controller") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200114210947.14083-1-qais.yousef@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ff03c27779e..55b9a9c53b91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7264,7 +7264,7 @@ capacity_from_percent(char *buf) &req.percent); if (req.ret) return req; - if (req.percent > UCLAMP_PERCENT_SCALE) { + if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { req.ret = -ERANGE; return req; } -- cgit v1.2.3 From e938b9c94164e4d981039f1cf6007d7453883e5a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jan 2020 08:50:27 +0800 Subject: sched/nohz: Optimize get_nohz_timer_target() On a machine, CPU 0 is used for housekeeping, the other 39 CPUs in the same socket are in nohz_full mode. We can observe huge time burn in the loop for seaching nearest busy housekeeper cpu by ftrace. 2) | get_nohz_timer_target() { 2) 0.240 us | housekeeping_test_cpu(); 2) 0.458 us | housekeeping_test_cpu(); ... 2) 0.292 us | housekeeping_test_cpu(); 2) 0.240 us | housekeeping_test_cpu(); 2) 0.227 us | housekeeping_any_cpu(); 2) + 43.460 us | } This patch optimizes the searching logic by finding a nearest housekeeper CPU in the housekeeping cpumask, it can minimize the worst searching time from ~44us to < 10us in my testing. In addition, the last iterated busy housekeeper can become a random candidate while current CPU is a better fallback if it is a housekeeper. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/1578876627-11938-1-git-send-email-wanpengli@tencent.com --- kernel/sched/core.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55b9a9c53b91..a8a5d5b6f5cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -552,27 +552,32 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu = smp_processor_id(); + int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) - return cpu; + if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (!idle_cpu(cpu)) + return cpu; + default_cpu = cpu; + } rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu_and(i, sched_domain_span(sd), + housekeeping_cpumask(HK_FLAG_TIMER)) { if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { + if (!idle_cpu(i)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) + default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + cpu = default_cpu; unlock: rcu_read_unlock(); return cpu; -- cgit v1.2.3 From 2a4b03ffc69f2dedc6388e9a6438b5f4c133a40d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 14 Jan 2020 15:13:56 +0100 Subject: sched/fair: Prevent unlimited runtime on throttled group When a running task is moved on a throttled task group and there is no other task enqueued on the CPU, the task can keep running using 100% CPU whatever the allocated bandwidth for the group and although its cfs rq is throttled. Furthermore, the group entity of the cfs_rq and its parents are not enqueued but only set as curr on their respective cfs_rqs. We have the following sequence: sched_move_task -dequeue_task: dequeue task and group_entities. -put_prev_task: put task and group entities. -sched_change_group: move task to new group. -enqueue_task: enqueue only task but not group entities because cfs_rq is throttled. -set_next_task : set task and group_entities as current sched_entity of their cfs_rq. Another impact is that the root cfs_rq runnable_load_avg at root rq stays null because the group_entities are not enqueued. This situation will stay the same until an "external" event triggers a reschedule. Let trigger it immediately instead. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Ben Segall Link: https://lkml.kernel.org/r/1579011236-31256-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8a5d5b6f5cf..89e54f3ed571 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7072,8 +7072,15 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); - if (running) + if (running) { set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. + */ + resched_curr(rq); + } task_rq_unlock(rq, tsk, &rf); } -- cgit v1.2.3 From 8c8c5a4994a306c217fd061cbfc5903399fd4c1c Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 10 Jan 2020 18:19:33 +0100 Subject: dma-contiguous: CMA: give precedence to cmdline Although the device tree might contain a reserved-memory DT node dedicated as the default CMA pool, users might want to change CMA's parameters using the kernel command line for debugging purposes and whatnot. Honor this by bypassing the reserved memory CMA setup, which will ultimately end up freeing the memblock and allow the command line CMA configuration routine to run. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Phil Elwell Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index daa4e6eefdde..8bc6f2d670f9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; + bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; int err; + if (size_cmdline != -1 && default_cma) { + pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", + rmem->name); + return -EBUSY; + } + if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; @@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) /* Architecture specific contiguous memory fixup. */ dma_contiguous_early_fixup(rmem->base, rmem->size); - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + if (default_cma) dma_contiguous_set_default(cma); rmem->ops = &rmem_cma_ops; -- cgit v1.2.3 From 0cd9d33ace336bc424fc30944aa3defd6786e4fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 30 Jan 2020 11:37:33 -0500 Subject: cgroup: init_tasks shouldn't be linked to the root cgroup 5153faac18d2 ("cgroup: remove cgroup_enable_task_cg_lists() optimization") removed lazy initialization of css_sets so that new tasks are always lniked to its css_set. In the process, it incorrectly ended up adding init_tasks to root css_set. They show up as PID 0's in root's cgroup.procs triggering warnings in systemd and generally confusing people. Fix it by skip css_set linking for init_tasks. Signed-off-by: Tejun Heo Reported-by: https://github.com/joanbm Link: https://github.com/systemd/systemd/issues/14682 Fixes: 5153faac18d2 ("cgroup: remove cgroup_enable_task_cg_lists() optimization") Cc: stable@vger.kernel.org # v5.5+ --- kernel/cgroup/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b3744872263e..cf8a36bdf5c8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5932,11 +5932,14 @@ void cgroup_post_fork(struct task_struct *child) spin_lock_irq(&css_set_lock); - WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); + /* init tasks are special, only link regular threads */ + if (likely(child->pid)) { + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + } /* * If the cgroup has to be frozen, the new task has too. Let's set -- cgit v1.2.3 From 91ef26f914171cf753330f13724fd9142b5b1640 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 18:11:10 +0100 Subject: dma-direct: relax addressability checks in dma_direct_supported dma_direct_supported tries to find the minimum addressable bitmask based on the end pfn and optional magic that architectures can use to communicate the size of the magic ZONE_DMA that can be used for bounce buffering. But between the DMA offsets that can change per device (or sometimes even region), the fact the ZONE_DMA isn't even guaranteed to be the lowest addresses and failure of having proper interfaces to the MM code this fails at least for one arm subarchitecture. As all the legacy DMA implementations have supported 32-bit DMA masks, and 32-bit masks are guranteed to always work by the API contract (using bounce buffers if needed), we can short cut the complicated check and always return true without breaking existing assumptions. Hopefully we can properly clean up the interaction with the arch defined zones and the bootmem allocator eventually. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: Peter Ujfalusi Signed-off-by: Christoph Hellwig Tested-by: Peter Ujfalusi --- kernel/dma/direct.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6af7ae83c4ad..32ec69cdba54 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -472,28 +472,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, } #endif /* CONFIG_MMU */ -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; - - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(zone_dma_bits); - else - min_mask = DMA_BIT_MASK(32); + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so * use __phys_to_dma() here so that the SME encryption mask isn't * part of the check. */ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); return mask >= __phys_to_dma(dev, min_mask); } -- cgit v1.2.3 From 4a47cbae04844f0c5e2365aa6c217b61850bb832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:44:38 +0100 Subject: dma-direct: improve swiotlb error reporting Untangle the way how dma_direct_map_page calls into swiotlb to be able to properly report errors where the swiotlb DMA address overflows the mask separately from overflows in the !swiotlb case. This means that siotlb_map now has to do a little more work that duplicates dma_direct_map_page, but doing so greatly simplifies the calling convention. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 16 +++++++--------- kernel/dma/swiotlb.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 32ec69cdba54..594bddd04e01 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -357,13 +357,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -371,8 +364,13 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9280d6f8271e..c19379fabd20 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + paddr, size, size, dir, attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = __phys_to_dma(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) -- cgit v1.2.3 From 75467ee48a5e04cf3ae3cb39aea6adee73aeff91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:54:50 +0100 Subject: dma-direct: improve DMA mask overflow reporting Remove the unset dma_mask case as that won't get into mapping calls anymore, and also report the other errors unconditonally and with a slightly improved message. Remove the now pointless report_addr helper. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 594bddd04e01..ac7956c38f69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,18 +23,6 @@ */ unsigned int zone_dma_bits __ro_after_init = 24; -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -371,7 +359,9 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, if (swiotlb_force != SWIOTLB_NO_FORCE) return swiotlb_map(dev, phys, size, dir, attrs); - report_addr(dev, dma_addr, size); + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -409,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } -- cgit v1.2.3 From 52262ee567ad14c9606be25f3caddcefa3c514e4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 28 Jan 2020 15:40:06 +0000 Subject: sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression The following XFS commit: 8ab39f11d974 ("xfs: prevent CIL push holdoff in log recovery") changed the logic from using bound workqueues to using unbound workqueues. Functionally this makes sense but it was observed at the time that the dbench performance dropped quite a lot and CPU migrations were increased. The current pattern of the task migration is straight-forward. With XFS, an IO issuer delegates work to xlog_cil_push_work ()on an unbound kworker. This runs on a nearby CPU and on completion, dbench wakes up on its old CPU as it is still idle and no migration occurs. dbench then queues the real IO on the blk_mq_requeue_work() work item which runs on a bound kworker which is forced to run on the same CPU as dbench. When IO completes, the bound kworker wakes dbench but as the kworker is a bound but, real task, the CPU is not considered idle and dbench gets migrated by select_idle_sibling() to a new CPU. dbench may ping-pong between two CPUs for a while but ultimately it starts a round-robin of all CPUs sharing the same LLC. High-frequency migration on each IO completion has poor performance overall. It has negative implications both in commication costs and power management. mpstat confirmed that at low thread counts that all CPUs sharing an LLC has low level of activity. Note that even if the CIL patch was reverted, there still would be migrations but the impact is less noticeable. It turns out that individually the scheduler, XFS, blk-mq and workqueues all made sensible decisions but in combination, the overall effect was sub-optimal. This patch special cases the IO issue/completion pattern and allows a bound kworker waker and a task wakee to stack on the same CPU if there is a strong chance they are directly related. The expectation is that the kworker is likely going back to sleep shortly. This is not guaranteed as the IO could be queued asynchronously but there is a very strong relationship between the task and kworker in this case that would justify stacking on the same CPU instead of migrating. There should be few concerns about kworker starvation given that the special casing is only when the kworker is the waker. DBench on XFS MMTests config: io-dbench4-async modified to run on a fresh XFS filesystem UMA machine with 8 cores sharing LLC 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack Amean 1 22.63 ( 0.00%) 20.54 * 9.23%* Amean 2 25.56 ( 0.00%) 23.40 * 8.44%* Amean 4 28.63 ( 0.00%) 27.85 * 2.70%* Amean 8 37.66 ( 0.00%) 37.68 ( -0.05%) Amean 64 469.47 ( 0.00%) 468.26 ( 0.26%) Stddev 1 1.00 ( 0.00%) 0.72 ( 28.12%) Stddev 2 1.62 ( 0.00%) 1.97 ( -21.54%) Stddev 4 2.53 ( 0.00%) 3.58 ( -41.19%) Stddev 8 5.30 ( 0.00%) 5.20 ( 1.92%) Stddev 64 86.36 ( 0.00%) 94.53 ( -9.46%) NUMA machine, 48 CPUs total, 24 CPUs share cache 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Amean 1 58.69 ( 0.00%) 30.21 * 48.53%* Amean 2 60.90 ( 0.00%) 35.29 * 42.05%* Amean 4 66.77 ( 0.00%) 46.55 * 30.28%* Amean 8 81.41 ( 0.00%) 68.46 * 15.91%* Amean 16 113.29 ( 0.00%) 107.79 * 4.85%* Amean 32 199.10 ( 0.00%) 198.22 * 0.44%* Amean 64 478.99 ( 0.00%) 477.06 * 0.40%* Amean 128 1345.26 ( 0.00%) 1372.64 * -2.04%* Stddev 1 2.64 ( 0.00%) 4.17 ( -58.08%) Stddev 2 4.35 ( 0.00%) 5.38 ( -23.73%) Stddev 4 6.77 ( 0.00%) 6.56 ( 3.00%) Stddev 8 11.61 ( 0.00%) 10.91 ( 6.04%) Stddev 16 18.63 ( 0.00%) 19.19 ( -3.01%) Stddev 32 38.71 ( 0.00%) 38.30 ( 1.06%) Stddev 64 100.28 ( 0.00%) 91.24 ( 9.02%) Stddev 128 186.87 ( 0.00%) 160.34 ( 14.20%) Dbench has been modified to report the time to complete a single "load file". This is a more meaningful metric for dbench that a throughput metric as the benchmark makes many different system calls that are not throughput-related Patch shows a 9.23% and 48.53% reduction in the time to process a load file with the difference partially explained by the number of CPUs sharing a LLC. In a separate run, task migrations were almost eliminated by the patch for low client counts. In case people have issue with the metric used for the benchmark, this is a comparison of the throughputs as reported by dbench on the NUMA machine. dbench4 Throughput (misleading but traditional) 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Hmean 1 321.41 ( 0.00%) 617.82 * 92.22%* Hmean 2 622.87 ( 0.00%) 1066.80 * 71.27%* Hmean 4 1134.56 ( 0.00%) 1623.74 * 43.12%* Hmean 8 1869.96 ( 0.00%) 2212.67 * 18.33%* Hmean 16 2673.11 ( 0.00%) 2806.13 * 4.98%* Hmean 32 3032.74 ( 0.00%) 3039.54 ( 0.22%) Hmean 64 2514.25 ( 0.00%) 2498.96 * -0.61%* Hmean 128 1778.49 ( 0.00%) 1746.05 * -1.82%* Note that this is somewhat specific to XFS and ext4 shows no performance difference as it does not rely on kworkers in the same way. No major problem was observed running other workloads on different machines although not all tests have completed yet. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200128154006.GD3466@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ----------- kernel/sched/fair.c | 14 ++++++++++++++ kernel/sched/sched.h | 13 +++++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89e54f3ed571..1a9983da4408 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1447,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25dffc03f0f6..94c3b8469cf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5912,6 +5912,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { + return prev; + } + /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a88dc8ad11b..5876e6ba5903 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq, { } #endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif -- cgit v1.2.3 From 10f129cb59cf6b4aee419ce4b1325fc532d975fb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 5 Feb 2020 16:34:04 -0600 Subject: tracing/kprobe: Fix uninitialized variable bug There is a potential execution path in which variable *ret* is returned without being properly initialized, previously. Fix this by initializing variable *ret* to 0. Link: http://lkml.kernel.org/r/20200205223404.GA3379@embeddedor Addresses-Coverity-ID: 1491142 ("Uninitialized scalar variable") Fixes: 2a588dd1d5d6 ("tracing: Add kprobe event command generation functions") Reviewed-by: Tom Zanussi Acked-by: Masami Hiramatsu Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 51efc790aea8..21bafd48f2ac 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1012,7 +1012,7 @@ int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) { struct dynevent_arg arg; va_list args; - int ret; + int ret = 0; if (cmd->type != DYNEVENT_TYPE_KPROBE) return -EINVAL; -- cgit v1.2.3 From 2bf0eb9b3b0d099b20b2c4736436b666d78b94d5 Mon Sep 17 00:00:00 2001 From: Hongbo Yao Date: Mon, 10 Feb 2020 09:14:41 +0800 Subject: bpf: Make btf_check_func_type_match() static Fix the following sparse warning: kernel/bpf/btf.c:4131:5: warning: symbol 'btf_check_func_type_match' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Hongbo Yao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200210011441.147102-1-yaohongbo@huawei.com --- kernel/bpf/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 805c43b083e9..787140095e58 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ -int btf_check_func_type_match(struct bpf_verifier_log *log, - struct btf *btf1, const struct btf_type *t1, - struct btf *btf2, const struct btf_type *t2) +static int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; -- cgit v1.2.3 From d090409abbdd1fcbdfd6ed66612390ba8c814749 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:48 -0600 Subject: tracing: Add missing nest end to synth_event_trace_start() error case If the ring_buffer reserve in synth_event_trace_start() fails, the matching ring_buffer_nest_end() should be called in the error code, since nothing else will ever call it in this case. Link: http://lkml.kernel.org/r/20abc444b3eeff76425f895815380abe7aa53ff8.1581374549.git.zanussi@kernel.org Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b3bcfd8c7332..a546ffa14785 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2043,6 +2043,7 @@ int synth_event_trace_start(struct trace_event_file *file, entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, sizeof(*entry) + fields_size); if (!entry) { + ring_buffer_nest_end(trace_state->buffer); ret = -EINVAL; goto out; } -- cgit v1.2.3 From 0c62f6cd9ed320cb0ca39e33addf3a3da51b7328 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:49 -0600 Subject: tracing: Don't return -EINVAL when tracing soft disabled synth events There's no reason to return -EINVAL when tracing a synthetic event if it's soft disabled - treat it the same as if it were hard disabled and return normally. Have synth_event_trace() and synth_event_trace_array() just return normally, and have synth_event_trace_start set the trace state to disabled and return. Link: http://lkml.kernel.org/r/df5d02a1625aff97c9866506c5bada6a069982ba.1581374549.git.zanussi@kernel.org Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a546ffa14785..99a02168599b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1828,7 +1828,8 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) * called directly by the user, we don't have that but we * still need to honor not logging when disabled. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) return 0; event = file->event_call->data; @@ -1836,9 +1837,6 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) if (n_vals != event->n_fields) return -EINVAL; - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - fields_size = event->n_u64 * sizeof(u64); /* @@ -1918,7 +1916,8 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, * called directly by the user, we don't have that but we * still need to honor not logging when disabled. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) return 0; event = file->event_call->data; @@ -1926,9 +1925,6 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, if (n_vals != event->n_fields) return -EINVAL; - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - fields_size = event->n_u64 * sizeof(u64); /* @@ -2017,7 +2013,8 @@ int synth_event_trace_start(struct trace_event_file *file, * trace case, we save the enabed state upon start and just * ignore the following data calls. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { trace_state->enabled = false; goto out; } @@ -2026,11 +2023,6 @@ int synth_event_trace_start(struct trace_event_file *file, trace_state->event = file->event_call->data; - if (trace_trigger_soft_disabled(file)) { - ret = -EINVAL; - goto out; - } - fields_size = trace_state->event->n_u64 * sizeof(u64); /* -- cgit v1.2.3 From 7276531d4036f5db2af15c8b6caa02e7741f5d80 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:50 -0600 Subject: tracing: Consolidate trace() functions Move the checking, buffer reserve and buffer commit code in synth_event_trace_start/end() into inline functions __synth_event_trace_start/end() so they can also be used by synth_event_trace() and synth_event_trace_array(), and then have all those functions use them. Also, change synth_event_trace_state.enabled to disabled so it only needs to be set if the event is disabled, which is not normally the case. Link: http://lkml.kernel.org/r/b1f3108d0f450e58192955a300e31d0405ab4149.1581374549.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 220 +++++++++++++++------------------------ 1 file changed, 86 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 99a02168599b..65b54d6a1422 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1791,6 +1791,60 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) } EXPORT_SYMBOL_GPL(synth_event_cmd_init); +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + /** * synth_event_trace - Trace a synthetic event * @file: The trace_event_file representing the synthetic event @@ -1812,69 +1866,38 @@ EXPORT_SYMBOL_GPL(synth_event_cmd_init); */ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; va_list args; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + int ret; - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ret = -EINVAL; - goto out; + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; } va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; val = va_arg(args, u64); - if (event->fields[i]->is_string) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)val; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = val; + state.entry->fields[n_u64] = val; n_u64++; } } va_end(args); - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -1901,62 +1924,31 @@ EXPORT_SYMBOL_GPL(synth_event_trace); int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + int ret; - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ret = -EINVAL; - goto out; + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; } - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - if (event->fields[i]->is_string) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = vals[i]; + state.entry->fields[n_u64] = vals[i]; n_u64++; } } - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -1993,55 +1985,17 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array); int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state) { - struct synth_trace_event *entry; - int fields_size = 0; - int ret = 0; + int ret; - if (!trace_state) { - ret = -EINVAL; - goto out; - } + if (!trace_state) + return -EINVAL; memset(trace_state, '\0', sizeof(*trace_state)); - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) { - trace_state->enabled = false; - goto out; - } - - trace_state->enabled = true; + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ - trace_state->event = file->event_call->data; - - fields_size = trace_state->event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); - - entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ring_buffer_nest_end(trace_state->buffer); - ret = -EINVAL; - goto out; - } - - trace_state->entry = entry; -out: return ret; } EXPORT_SYMBOL_GPL(synth_event_trace_start); @@ -2074,7 +2028,7 @@ static int __synth_event_add_val(const char *field_name, u64 val, trace_state->add_next = true; } - if (!trace_state->enabled) + if (trace_state->disabled) goto out; event = trace_state->event; @@ -2209,9 +2163,7 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state) if (!trace_state) return -EINVAL; - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); + __synth_event_trace_end(trace_state); return 0; } -- cgit v1.2.3 From e3728b50cd9be7d4b1469447cdf1feb93e3b7adb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2020 10:11:02 +0100 Subject: ACPI: PM: s2idle: Avoid possible race related to the EC GPE It is theoretically possible for the ACPI EC GPE to be set after the s2idle_ops->wake() called from s2idle_loop() has returned and before the subsequent pm_wakeup_pending() check is carried out. If that happens, the resulting wakeup event will cause the system to resume even though it may be a spurious one. To avoid that race, first make the ->wake() callback in struct platform_s2idle_ops return a bool value indicating whether or not to let the system resume and rearrange s2idle_loop() to use that value instad of the direct pm_wakeup_pending() call if ->wake() is present. Next, rework acpi_s2idle_wake() to process EC events and check pm_wakeup_pending() before re-arming the SCI for system wakeup to prevent it from triggering prematurely and add comments to that function to explain the rationale for the new code flow. Fixes: 56b991849009 ("PM: sleep: Simplify suspend-to-idle control flow") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2c47280fbfc7..8b1bb5ee7e5d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -131,11 +131,12 @@ static void s2idle_loop(void) * to avoid them upfront. */ for (;;) { - if (s2idle_ops && s2idle_ops->wake) - s2idle_ops->wake(); - - if (pm_wakeup_pending()) + if (s2idle_ops && s2idle_ops->wake) { + if (s2idle_ops->wake()) + break; + } else if (pm_wakeup_pending()) { break; + } pm_wakeup_clear(false); -- cgit v1.2.3 From 6fcca0fa48118e6d63733eb4644c6cd880c15b8f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 3 Feb 2020 13:22:16 -0800 Subject: sched/psi: Fix OOB write when writing 0 bytes to PSI files Issuing write() with count parameter set to 0 on any file under /proc/pressure/ will cause an OOB write because of the access to buf[buf_size-1] when NUL-termination is performed. Fix this by checking for buf_size to be non-zero. Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20200203212216.7076-1-surenb@google.com --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db7b50bba3f1..38ccd49b9bf6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; + if (!nbytes) + return -EINVAL; + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; -- cgit v1.2.3 From 4104a562e0ca62e971089db9d3c47794a0d7d4eb Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sat, 1 Feb 2020 18:28:03 +0530 Subject: sched/core: Annotate curr pointer in rq with __rcu This patch fixes the following sparse warnings in sched/core.c and sched/membarrier.c: kernel/sched/core.c:2372:27: error: incompatible types in comparison expression kernel/sched/core.c:4061:17: error: incompatible types in comparison expression kernel/sched/core.c:6067:9: error: incompatible types in comparison expression kernel/sched/membarrier.c:108:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:177:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:243:21: error: incompatible types in comparison expression Signed-off-by: Madhuparna Bhowmik Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200201125803.20245-1-madhuparnabhowmik10@gmail.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5876e6ba5903..9ea647835fd6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -896,7 +896,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr; + struct task_struct __rcu *curr; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; -- cgit v1.2.3 From e9f5490c3574b435ce7fe7a71724aa3866babc7f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 9 Feb 2020 19:29:12 -0800 Subject: sched/fair: Fix kernel-doc warning in attach_entity_load_avg() Fix kernel-doc warning in kernel/sched/fair.c, caused by a recent function parameter removal: ../kernel/sched/fair.c:3526: warning: Excess function parameter 'flags' description in 'attach_entity_load_avg' Fixes: a4f9a0e51bbf ("sched/fair: Remove redundant call to cpufreq_update_util()") Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/cbe964e4-6879-fd08-41c9-ef1917414af4@infradead.org --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94c3b8469cf6..3c8a379c357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. -- cgit v1.2.3 From cba6437a1854fde5934098ec3bd0ee83af3129f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Feb 2020 12:19:41 +0100 Subject: genirq/proc: Reject invalid affinity masks (again) Qian Cai reported that the WARN_ON() in the x86/msi affinity setting code, which catches cases where the affinity setting is not done on the CPU which is the current target of the interrupt, triggers during CPU hotplug stress testing. It turns out that the warning which was added with the commit addressing the MSI affinity race unearthed yet another long standing bug. If user space writes a bogus affinity mask, i.e. it contains no online CPUs, then it calls irq_select_affinity_usr(). This was introduced for ALPHA in eee45269b0f5 ("[PATCH] Alpha: convert to generic irq framework (generic part)") and subsequently made available for all architectures in 18404756765c ("genirq: Expose default irq affinity mask (take 3)") which introduced the circumvention of the affinity setting restrictions for interrupt which cannot be moved in process context. The whole exercise is bogus in various aspects: 1) If the interrupt is already started up then there is absolutely no point to honour a bogus interrupt affinity setting from user space. The interrupt is already assigned to an online CPU and it does not make any sense to reassign it to some other randomly chosen online CPU. 2) If the interupt is not yet started up then there is no point either. A subsequent startup of the interrupt will invoke irq_setup_affinity() anyway which will chose a valid target CPU. So the only correct solution is to just return -EINVAL in case user space wrote an affinity mask which does not contain any online CPUs, except for ALPHA which has it's own magic sauce for this. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Tested-by: Qian Cai Link: https://lkml.kernel.org/r/878sl8xdbm.fsf@nanos.tec.linutronix.de --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 18 ++---------------- kernel/irq/proc.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3924fbe829d4..c9d8eb7f5c02 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq); - extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3089a60ea8f9..7eee98c38f25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -481,23 +481,9 @@ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } -#endif +#endif /* CONFIG_AUTO_IRQ_AFFINITY */ +#endif /* CONFIG_SMP */ -/* - * Called when a bogus affinity is set via /proc/irq - */ -int irq_select_affinity_usr(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_setup_affinity(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} -#endif /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9e5783d98033..32c071d7bc03 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) return show_irq_affinity(AFFINITY_LIST, m); } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +static inline int irq_select_affinity_usr(unsigned int irq) +{ + /* + * If the interrupt is started up already then this fails. The + * interrupt is assigned to an online CPU already. There is no + * point to move it around randomly. Tell user space that the + * selected mask is bogus. + * + * If not then any change to the affinity is pointless because the + * startup code invokes irq_setup_affinity() which will select + * a online CPU anyway. + */ + return -EINVAL; +} +#else +/* ALPHA magic affinity auto selector. Keep it for historical reasons. */ +static inline int irq_select_affinity_usr(unsigned int irq) +{ + return irq_select_affinity(irq); +} +#endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) -- cgit v1.2.3 From e20d3a055a457a10a4c748ce5b7c2ed3173a1324 Mon Sep 17 00:00:00 2001 From: Johannes Krude Date: Wed, 12 Feb 2020 20:32:27 +0100 Subject: bpf, offload: Replace bitwise AND by logical AND in bpf_prog_offload_info_fill This if guards whether user-space wants a copy of the offload-jited bytecode and whether this bytecode exists. By erroneously doing a bitwise AND instead of a logical AND on user- and kernel-space buffer-size can lead to no data being copied to user-space especially when user-space size is a power of two and bigger then the kernel-space buffer. Fixes: fcfb126defda ("bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info") Signed-off-by: Johannes Krude Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20200212193227.GA3769@phlox.h.transitiv.net --- kernel/bpf/offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c5dc6541ece..bd09290e3648 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { -- cgit v1.2.3 From 140588bfed2727e9a4814211617a79401d74922f Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 14 Feb 2020 18:26:28 +0100 Subject: s390: remove obsolete ieee_emulation_warnings s390 math emulation was removed with commit 5a79859ae0f3 ("s390: remove 31 bit support"), rendering ieee_emulation_warnings useless. The code still built because it was protected by CONFIG_MATHEMU, which was no longer selectable. This patch removes the sysctl_ieee_emulation_warnings declaration and the sysctl entry declaration. Link: https://lkml.kernel.org/r/20200214172628.3598516-1-steve@sk2.org Reviewed-by: Vasily Gorbik Signed-off-by: Stephen Kitt Signed-off-by: Vasily Gorbik --- kernel/sysctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d396aaaf19a3..ad5b88a53c5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = { .extra2 = &maxolduid, }, #ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "userprocess_debug", .data = &show_unhandled_signals, -- cgit v1.2.3 From 492e0d0d6f2eb4badfd2868addf9da0f651eba0e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Tue, 18 Feb 2020 09:25:52 -0800 Subject: bpf: Do not grab the bucket spinlock by default on htab batch ops Grabbing the spinlock for every bucket even if it's empty, was causing significant perfomance cost when traversing htab maps that have only a few entries. This patch addresses the issue by checking first the bucket_cnt, if the bucket has some entries then we go and grab the spinlock and proceed with the batching. Tested with a htab of size 50K and different value of populated entries. Before: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 2759655 2752033 BM_DumpHashMap/10 2933722 2930825 BM_DumpHashMap/200 3171680 3170265 BM_DumpHashMap/500 3639607 3635511 BM_DumpHashMap/1000 4369008 4364981 BM_DumpHashMap/5k 11171919 11134028 BM_DumpHashMap/20k 69150080 69033496 BM_DumpHashMap/39k 190501036 190226162 After: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 202707 200109 BM_DumpHashMap/10 213441 210569 BM_DumpHashMap/200 478641 472350 BM_DumpHashMap/500 980061 967102 BM_DumpHashMap/1000 1863835 1839575 BM_DumpHashMap/5k 8961836 8902540 BM_DumpHashMap/20k 69761497 69322756 BM_DumpHashMap/39k 187437830 186551111 Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200218172552.215077-1-brianvv@google.com --- kernel/bpf/hashtab.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2d182c4ee9d9..9194479a2fa7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1259,7 +1259,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; - unsigned long flags; + unsigned long flags = 0; + bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; @@ -1319,15 +1320,25 @@ again_nocopy: dst_val = values; b = &htab->buckets[batch]; head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + /* do not grab the lock unless need it (bucket_cnt > 0). */ + if (locked) + raw_spin_lock_irqsave(&b->lock, flags); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; + if (bucket_cnt && !locked) { + locked = true; + goto again_nocopy; + } + if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1337,6 +1348,9 @@ again_nocopy: if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1346,6 +1360,10 @@ again_nocopy: goto alloc; } + /* Next block is only safe to run if you have grabbed the lock */ + if (!locked) + goto next_batch; + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); @@ -1380,6 +1398,8 @@ again_nocopy: } raw_spin_unlock_irqrestore(&b->lock, flags); + locked = false; +next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ -- cgit v1.2.3 From b9aff38de2cb166476988020428985c5f7412ffc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 19 Feb 2020 15:47:57 -0800 Subject: bpf: Fix a potential deadlock with bpf_map_do_batch Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") added lookup_and_delete batch operation for hash table. The current implementation has bpf_lru_push_free() inside the bucket lock, which may cause a deadlock. syzbot reports: -> #2 (&htab->buckets[i].lock#2){....}: __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593 __bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220 [inline] __bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266 bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340 [inline] bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline] bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499 prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132 __htab_lru_percpu_map_update_elem+0x67e/0xa90 kernel/bpf/hashtab.c:1069 bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585 bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181 generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (&loc_l->lock){....}: check_prev_add kernel/locking/lockdep.c:2475 [inline] check_prevs_add kernel/locking/lockdep.c:2580 [inline] validate_chain kernel/locking/lockdep.c:2970 [inline] __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954 lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline] bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555 __htab_map_lookup_and_delete_batch+0x8d4/0x1540 kernel/bpf/hashtab.c:1374 htab_lru_map_lookup_and_delete_batch+0x34/0x40 kernel/bpf/hashtab.c:1491 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe Possible unsafe locking scenario: CPU0 CPU2 ---- ---- lock(&htab->buckets[i].lock#2); lock(&l->lock); lock(&htab->buckets[i].lock#2); lock(&loc_l->lock); *** DEADLOCK *** To fix the issue, for htab_lru_map_lookup_and_delete_batch() in CPU0, let us do bpf_lru_push_free() out of the htab bucket lock. This can avoid the above deadlock scenario. Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Reported-by: syzbot+a38ff3d9356388f2fb83@syzkaller.appspotmail.com Reported-by: syzbot+122b5421d14e68f29cd1@syzkaller.appspotmail.com Suggested-by: Hillf Danton Suggested-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Brian Vazquez Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200219234757.3544014-1-yhs@fb.com --- kernel/bpf/hashtab.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9194479a2fa7..a1468e3f5af2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -56,6 +56,7 @@ struct htab_elem { union { struct bpf_htab *htab; struct pcpu_freelist_node fnode; + struct htab_elem *batch_flink; }; }; }; @@ -126,6 +127,17 @@ free_elems: bpf_map_area_free(htab->elems); } +/* The LRU list has a lock (lru_lock). Each htab bucket has a lock + * (bucket_lock). If both locks need to be acquired together, the lock + * order is always lru_lock -> bucket_lock and this only happens in + * bpf_lru_list.c logic. For example, certain code path of + * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), + * will acquire lru_lock first followed by acquiring bucket_lock. + * + * In hashtab.c, to avoid deadlock, lock acquisition of + * bucket_lock followed by lru_lock is not allowed. In such cases, + * bucket_lock needs to be released first before acquiring lru_lock. + */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { @@ -1256,6 +1268,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; + struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; @@ -1388,10 +1401,18 @@ again_nocopy: } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); - if (is_lru_map) - bpf_lru_push_free(&htab->lru, &l->lru_node); - else + + /* bpf_lru_push_free() will acquire lru_lock, which + * may cause deadlock. See comments in function + * prealloc_lru_pop(). Let us do bpf_lru_push_free() + * after releasing the bucket lock. + */ + if (is_lru_map) { + l->batch_flink = node_to_free; + node_to_free = l; + } else { free_htab_elem(htab, l); + } } dst_key += key_size; dst_val += value_size; @@ -1399,6 +1420,13 @@ again_nocopy: raw_spin_unlock_irqrestore(&b->lock, flags); locked = false; + + while (node_to_free) { + l = node_to_free; + node_to_free = node_to_free->batch_flink; + bpf_lru_push_free(&htab->lru, &l->lru_node); + } + next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. -- cgit v1.2.3 From b0c609ab2057d0953fa05e7566f0c0e8a28fa9e1 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 14 Feb 2020 15:06:21 +0100 Subject: PM / hibernate: fix typo "reserverd_size" -> "reserved_size" Fix a mistake in a variable name in a comment. Signed-off-by: Alexandre Belloni Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ddade80ad276..d82b7b88d616 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1681,7 +1681,7 @@ static unsigned long minimum_image_size(unsigned long saveable) * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * -- cgit v1.2.3 From 279eef0531928a6669230879a6eed081513ad5a3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:38 -0600 Subject: tracing: Make sure synth_event_trace() example always uses u64 synth_event_trace() is the varargs version of synth_event_trace_array(), which takes an array of u64, as do synth_event_add_val() et al. To not only be consistent with those, but also to address the fact that synth_event_trace() expects every arg to be of the same type since it doesn't also pass in e.g. a format string, the caller needs to make sure all args are of the same type, u64. u64 is used because it needs to accomodate the largest type available in synthetic events, which is u64. This fixes the bug reported by the kernel test robot/Rong Chen. Link: https://lore.kernel.org/lkml/20200212113444.GS12867@shao2-debian/ Link: http://lkml.kernel.org/r/894c4e955558b521210ee0642ba194a9e603354c.1581720155.git.zanussi@kernel.org Fixes: 9fe41efaca084 ("tracing: Add synth event generation test module") Reported-by: kernel test robot Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 4aefe003cb7c..6866280a9b10 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"hula hoops"; /* next_comm_field */ + vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ /* Now generate a gen_synth_test event */ @@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed_2.0"; /* my_string_field */ + vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ /* Now trace an empty_synth_test event */ @@ -290,11 +290,11 @@ static int __init test_create_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ @@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void) goto out; /* next_comm_field */ - ret = synth_event_add_next_val((u64)"slinky", &trace_state); + ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state); if (ret) goto out; @@ -350,7 +350,7 @@ static int __init test_add_next_synth_val(void) goto out; /* my_string_field */ - ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state); + ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state); if (ret) goto out; @@ -396,12 +396,12 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("next_comm_field", (u64)"silly putty", + ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty", &trace_state); if (ret) goto out; - ret = synth_event_add_val("my_string_field", (u64)"thneed_9", + ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9", &trace_state); if (ret) goto out; @@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void) /* Trace some bogus values just for testing */ ret = synth_event_trace(create_synth_test, 7, /* number of values */ - 444, /* next_pid_field */ - (u64)"clackers", /* next_comm_field */ - 1000000, /* ts_ns */ - 1000, /* ts_ms */ - smp_processor_id(), /* cpu */ - (u64)"Thneed", /* my_string_field */ - 999); /* my_int_field */ + (u64)444, /* next_pid_field */ + (u64)(long)"clackers", /* next_comm_field */ + (u64)1000000, /* ts_ns */ + (u64)1000, /* ts_ms */ + (u64)smp_processor_id(),/* cpu */ + (u64)(long)"Thneed", /* my_string_field */ + (u64)999); /* my_int_field */ return ret; } -- cgit v1.2.3 From 1d9d4c90194a8c3b2f7da9f4bf3f8ba2ed810656 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:39 -0600 Subject: tracing: Make synth_event trace functions endian-correct synth_event_trace(), synth_event_trace_array() and __synth_event_add_val() write directly into the trace buffer and need to take endianness into account, like trace_event_raw_event_synth() does. Link: http://lkml.kernel.org/r/2011354355e405af9c9d28abba430d1f5ff7771a.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 62 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65b54d6a1422..6a380fb83864 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1891,7 +1891,25 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = val; + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -1943,7 +1961,26 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = vals[i]; + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -2062,8 +2099,25 @@ static int __synth_event_add_val(const char *field_name, u64 val, str_field = (char *)&entry->fields[field->offset]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else - entry->fields[field->offset] = val; + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } out: return ret; } -- cgit v1.2.3 From 3843083772dc2afde790a6d7160658b00a808da1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:40 -0600 Subject: tracing: Check that number of vals matches number of synth event fields Commit 7276531d4036('tracing: Consolidate trace() functions') inadvertently dropped the synth_event_trace() and synth_event_trace_array() checks that verify the number of values passed in matches the number of fields in the synthetic event being traced, so add them back. Link: http://lkml.kernel.org/r/32819cac708714693669e0dfe10fe9d935e94a16.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6a380fb83864..45622194a34d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1878,6 +1878,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1914,7 +1919,7 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) } } va_end(args); - +out: __synth_event_trace_end(&state); return ret; @@ -1953,6 +1958,11 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; @@ -1984,7 +1994,7 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, n_u64++; } } - +out: __synth_event_trace_end(&state); return ret; -- cgit v1.2.3 From 784bd0847eda032ed2f3522f87250655a18c0190 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:41 -0600 Subject: tracing: Fix number printing bug in print_synth_event() Fix a varargs-related bug in print_synth_event() which resulted in strange output and oopses on 32-bit x86 systems. The problem is that trace_seq_printf() expects the varargs to match the format string, but print_synth_event() was always passing u64 values regardless. This results in unspecified behavior when unpacking with va_arg() in trace_seq_printf(). Add a function that takes the size into account when calling trace_seq_printf(). Before: modprobe-1731 [003] .... 919.039758: gen_synth_test: next_pid_field=777(null)next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=3(null)my_string_field=thneed my_int_field=598(null) After: insmod-1136 [001] .... 36.634590: gen_synth_test: next_pid_field=777 next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=1 my_string_field=thneed my_int_field=598 Link: http://lkml.kernel.org/r/a9b59eb515dbbd7d4abe53b347dccf7a8e285657.1581720155.git.zanussi@kernel.org Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 45622194a34d..f068d55bd37f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -820,6 +820,29 @@ static const char *synth_field_fmt(char *type) return fmt; } +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + static enum print_line_t print_synth_event(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -858,10 +881,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); - trace_seq_printf(s, print_fmt, se->fields[i]->name, - entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); -- cgit v1.2.3 From 3c18a9be7c9d4f53239795282c5d927f73f534b3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 16:29:50 -0500 Subject: tracing: Have synthetic event test use raw_smp_processor_id() The test code that tests synthetic event creation pushes in as one of its test fields the current CPU using "smp_processor_id()". As this is just something to see if the value is correctly passed in, and the actual CPU used does not matter, use raw_smp_processor_id(), otherwise with debug preemption enabled, a warning happens as the smp_processor_id() is called without preemption enabled. Link: http://lkml.kernel.org/r/20200220162950.35162579@gandalf.local.home Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 6866280a9b10..7d56d621ffea 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -114,7 +114,7 @@ static int __init test_gen_synth_cmd(void) vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ @@ -221,7 +221,7 @@ static int __init test_empty_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ @@ -293,7 +293,7 @@ static int __init test_create_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ @@ -345,7 +345,7 @@ static int __init test_add_next_synth_val(void) goto out; /* cpu */ - ret = synth_event_add_next_val(smp_processor_id(), &trace_state); + ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -388,7 +388,7 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state); + ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -427,7 +427,7 @@ static int __init test_trace_synth_event(void) (u64)(long)"clackers", /* next_comm_field */ (u64)1000000, /* ts_ns */ (u64)1000, /* ts_ms */ - (u64)smp_processor_id(),/* cpu */ + (u64)raw_smp_processor_id(), /* cpu */ (u64)(long)"Thneed", /* my_string_field */ (u64)999); /* my_int_field */ return ret; -- cgit v1.2.3 From 78041c0c9e935d9ce4086feeff6c569ed88ddfd4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 15:38:01 -0500 Subject: tracing: Disable trace_printk() on post poned tests The tracing seftests checks various aspects of the tracing infrastructure, and one is filtering. If trace_printk() is active during a self test, it can cause the filtering to fail, which will disable that part of the trace. To keep the selftests from failing because of trace_printk() calls, trace_printk() checks the variable tracing_selftest_running, and if set, it does not write to the tracing buffer. As some tracers were registered earlier in boot, the selftest they triggered would fail because not all the infrastructure was set up for the full selftest. Thus, some of the tests were post poned to when their infrastructure was ready (namely file system code). The postpone code did not set the tracing_seftest_running variable, and could fail if a trace_printk() was added and executed during their run. Cc: stable@vger.kernel.org Fixes: 9afecfbb95198 ("tracing: Postpone tracer start-up tests till the system is more robust") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183b031a3828..a89c562ffb8f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); + tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. @@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void) list_del(&p->list); kfree(p); } + tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 7ab215f22d04067094de8c81c20ba4c565ff8dd4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 17 Feb 2020 18:52:39 +0900 Subject: tracing: Clear trace_state when starting trace Clear trace_state data structure when starting trace in __synth_event_trace_start() internal function. Currently trace_state is initialized only in the synth_event_trace_start() API, but the trace_state in synth_event_trace() and synth_event_trace_array() are on the stack without initialization. This means those APIs will see wrong parameters and wil skip closing process in __synth_event_trace_end() because trace_state->disabled may be !0. Link: http://lkml.kernel.org/r/158193315899.8868.1781259176894639952.stgit@devnote2 Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f068d55bd37f..9d87aa1f0b79 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1824,6 +1824,8 @@ __synth_event_trace_start(struct trace_event_file *file, int entry_size, fields_size = 0; int ret = 0; + memset(trace_state, '\0', sizeof(*trace_state)); + /* * Normal event tracing doesn't get called at all unless the * ENABLED bit is set (which attaches the probe thus allowing @@ -2063,8 +2065,6 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - memset(trace_state, '\0', sizeof(*trace_state)); - ret = __synth_event_trace_start(file, trace_state); if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ -- cgit v1.2.3 From d8a953ddde5ec30a36810d0a892c3949b50849e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Feb 2020 21:18:33 +0900 Subject: bootconfig: Set CONFIG_BOOT_CONFIG=n by default Set CONFIG_BOOT_CONFIG=n by default. This also warns user if CONFIG_BOOT_CONFIG=n but "bootconfig" is given in the kernel command line. Link: http://lkml.kernel.org/r/158220111291.26565.9036889083940367969.stgit@devnote2 Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 91e885194dbc..795c3e02d3f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -143,7 +143,8 @@ if FTRACE config BOOTTIME_TRACING bool "Boot-time Tracing support" - depends on BOOT_CONFIG && TRACING + depends on TRACING + select BOOT_CONFIG default y help Enable developer to setup ftrace subsystem via supplemental -- cgit v1.2.3 From 412c53a680a97cb1ae2c0ab60230e193bee86387 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:54 -0800 Subject: y2038: remove unused time32 interfaces No users remain, so kill these off before we grow new ones. Link: http://lkml.kernel.org/r/20200110154232.4104492-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 64 ------------------------------------------------------ kernel/time/time.c | 43 ------------------------------------ 2 files changed, 107 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 95005f849c68..843dd17e6078 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,70 +26,6 @@ #include -static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __get_user(tv->tv_sec, &ctv->tv_sec) || - __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __put_user(tv->tv_sec, &ctv->tv_sec) || - __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int compat_get_timeval(struct timeval *tv, const void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_get_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_get_timeval); - -int compat_put_timeval(const struct timeval *tv, void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_put_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_put_timeval); - -int compat_get_timespec(struct timespec *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec); - -int compat_put_timespec(const struct timespec *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec); - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); -- cgit v1.2.3 From 2ad3e17ebf94b7b7f3f64c050ff168f9915345eb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 22 Feb 2020 20:36:47 -0500 Subject: audit: fix error handling in audit_data_to_entry() Commit 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") combined a number of separate fields in the audit_field struct into a single union. Generally this worked just fine because they are generally mutually exclusive. Unfortunately in audit_data_to_entry() the overlap can be a problem when a specific error case is triggered that causes the error path code to attempt to cleanup an audit_field struct and the cleanup involves attempting to free a stored LSM string (the lsm_str field). Currently the code always has a non-NULL value in the audit_field.lsm_str field as the top of the for-loop transfers a value into audit_field.val (both .lsm_str and .val are part of the same union); if audit_data_to_entry() fails and the audit_field struct is specified to contain a LSM string, but the audit_field.lsm_str has not yet been properly set, the error handling code will attempt to free the bogus audit_field.lsm_str value that was set with audit_field.val at the top of the for-loop. This patch corrects this by ensuring that the audit_field.val is only set when needed (it is cleared when the audit_field struct is allocated with kcalloc()). It also corrects a few other issues to ensure that in case of error the proper error code is returned. Cc: stable@vger.kernel.org Fixes: 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") Reported-by: syzbot+1f4d90ead370d72e450b@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/auditfilter.c | 71 +++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b0126e9c0743..026e34da4ace 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 f_val; err = -EINVAL; @@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; f->type = data->fields[i]; - f->val = data->values[i]; + f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; - f->val = 0; + f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } @@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: - f->uid = make_kuid(current_user_ns(), f->val); + f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; @@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - f->gid = make_kgid(current_user_ns(), f->val); + f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: + f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: @@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } + entry->rule.buflen += f_val; + f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they @@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; - } - if (err) { - kfree(str); + } else if (err) goto exit_free; - } else - f->lsm_str = str; break; case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); + } + err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } + entry->rule.buflen += f_val; break; case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; + entry->rule.buflen += f_val; break; case AUDIT_INODE: + f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; + } + entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: - if (entry->rule.exe || f->val > PATH_MAX) + if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); + str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } - entry->rule.buflen += f->val; - - audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } + entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; + default: + f->val = f_val; + break; } } -- cgit v1.2.3 From 756125289285f6e55a03861bf4b6257aa3d19a93 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 24 Feb 2020 16:38:57 -0500 Subject: audit: always check the netlink payload length in audit_receive_msg() This patch ensures that we always check the netlink payload length in audit_receive_msg() before we take any action on the payload itself. Cc: stable@vger.kernel.org Reported-by: syzbot+399c44bf1f43b8747403@syzkaller.appspotmail.com Reported-by: syzbot+e4b12d8d202701f08b6d@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/audit.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 17b0d523afb3..9ddfe2aa6671 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature audit_log_end(ab); } -static int audit_set_feature(struct sk_buff *skb) +static int audit_set_feature(struct audit_features *uaf) { - struct audit_features *uaf; int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); - uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; + int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); + data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { @@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) @@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; break; case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); + if (data_len < sizeof(struct audit_features)) + return -EINVAL; + err = audit_set_feature(data); if (err) return err; break; @@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ + char *str = data; + err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); @@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } audit_log_user_recv_msg(&ab, msg_type); - if (msg_type != AUDIT_USER_TTY) + if (msg_type != AUDIT_USER_TTY) { + /* ensure NULL termination */ + str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, - (char *)data); - else { - int size; - + str); + } else { audit_log_format(ab, " data="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); + if (data_len > 0 && str[data_len - 1] == '\0') + data_len--; + audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, @@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); + size_t msglen = data_len; char *old, *new; err = -EINVAL; @@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) -- cgit v1.2.3 From c780e86dd48ef6467a1146cf7d0fe1e05a635039 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Feb 2020 15:28:12 +0100 Subject: blktrace: Protect q->blk_trace with RCU KASAN is reporting that __blk_add_trace() has a use-after-free issue when accessing q->blk_trace. Indeed the switching of block tracing (and thus eventual freeing of q->blk_trace) is completely unsynchronized with the currently running tracing and thus it can happen that the blk_trace structure is being freed just while __blk_add_trace() works on it. Protect accesses to q->blk_trace by RCU during tracing and make sure we wait for the end of RCU grace period when shutting down tracing. Luckily that is rare enough event that we can afford that. Note that postponing the freeing of blk_trace to an RCU callback should better be avoided as it could have unexpected user visible side-effects as debugfs files would be still existing for a short while block tracing has been shut down. Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711 CC: stable@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Bart Van Assche Reported-by: Tristan Madani Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 114 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0735ae8545d8..4560878f0bac 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -335,6 +335,7 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } @@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { mutex_lock(&q->blk_trace_mutex); - - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } @@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; @@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what, u64 cgid) { - struct blk_trace *bt = rq->q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); @@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error, __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, error, 0, NULL, cgid); + rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, @@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } @@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } + rcu_read_unlock(); } /** @@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); @@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } /** @@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } /** @@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->blk_trace_mutex); @@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: -- cgit v1.2.3 From 2910b5aa6f545c044173a5cab3dbb7f43e23916d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 25 Feb 2020 23:36:41 +0900 Subject: bootconfig: Fix CONFIG_BOOTTIME_TRACING dependency issue Since commit d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") also changed the CONFIG_BOOTTIME_TRACING to select CONFIG_BOOT_CONFIG to show the boot-time tracing on the menu, it introduced wrong dependencies with BLK_DEV_INITRD as below. WARNING: unmet direct dependencies detected for BOOT_CONFIG Depends on [n]: BLK_DEV_INITRD [=n] Selected by [y]: - BOOTTIME_TRACING [=y] && TRACING_SUPPORT [=y] && FTRACE [=y] && TRACING [=y] This makes the CONFIG_BOOT_CONFIG selects CONFIG_BLK_DEV_INITRD to fix this error and make CONFIG_BOOTTIME_TRACING=n by default, so that both boot-time tracing and boot configuration off but those appear on the menu list. Link: http://lkml.kernel.org/r/158264140162.23842.11237423518607465535.stgit@devnote2 Fixes: d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") Reported-by: Randy Dunlap Compiled-tested-by: Randy Dunlap Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 795c3e02d3f1..402eef84c859 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -145,7 +145,6 @@ config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING select BOOT_CONFIG - default y help Enable developer to setup ftrace subsystem via supplemental kernel cmdline at boot time for debugging (tracing) driver -- cgit v1.2.3 From fda31c50292a5062332fa0343c084bd9f46604d9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 24 Feb 2020 12:47:14 -0800 Subject: signal: avoid double atomic counter increments for user accounting When queueing a signal, we increment both the users count of pending signals (for RLIMIT_SIGPENDING tracking) and we increment the refcount of the user struct itself (because we keep a reference to the user in the signal structure in order to correctly account for it when freeing). That turns out to be fairly expensive, because both of them are atomic updates, and particularly under extreme signal handling pressure on big machines, you can get a lot of cache contention on the user struct. That can then cause horrid cacheline ping-pong when you do these multiple accesses. So change the reference counting to only pin the user for the _first_ pending signal, and to unpin it when the last pending signal is dequeued. That means that when a user sees a lot of concurrent signal queuing - which is the only situation when this matters - the only atomic access needed is generally the 'sigpending' count update. This was noticed because of a particularly odd timing artifact on a dual-socket 96C/192T Cascade Lake platform: when you get into bad contention, on that machine for some reason seems to be much worse when the contention happens in the upper 32-byte half of the cacheline. As a result, the kernel test robot will-it-scale 'signal1' benchmark had an odd performance regression simply due to random alignment of the 'struct user_struct' (and pointed to a completely unrelated and apparently nonsensical commit for the regression). Avoiding the double increments (and decrements on the dequeueing side, of course) makes for much less contention and hugely improved performance on that will-it-scale microbenchmark. Quoting Feng Tang: "It makes a big difference, that the performance score is tripled! bump from original 17000 to 54000. Also the gap between 5.0-rc6 and 5.0-rc6+Jiri's patch is reduced to around 2%" [ The "2% gap" is the odd cacheline placement difference on that platform: under the extreme contention case, the effect of which half of the cacheline was hot was 5%, so with the reduced contention the odd timing artifact is reduced too ] It does help in the non-contended case too, but is not nearly as noticeable. Reported-and-tested-by: Feng Tang Cc: Eric W. Biederman Cc: Huang, Ying Cc: Philip Li Cc: Andi Kleen Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/signal.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9ad8dea93dbb..5b2396350dd1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi { struct sigqueue *q = NULL; struct user_struct *user; + int sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. + * + * NOTE! A pending signal will hold on to the user refcount, + * and we get/put the refcount only when the sigpending count + * changes from/to zero. */ rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); + user = __task_cred(t)->user; + sigpending = atomic_inc_return(&user->sigpending); + if (sigpending == 1) + get_uid(user); rcu_read_unlock(); - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); + if (atomic_dec_and_test(&user->sigpending)) + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; @@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); + if (atomic_dec_and_test(&q->user->sigpending)) + free_uid(q->user); kmem_cache_free(sigqueue_cachep, q); } -- cgit v1.2.3 From 289de35984815576793f579ec27248609e75976e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 18 Feb 2020 15:45:34 +0100 Subject: sched/fair: Fix statistics for find_idlest_group() sgs->group_weight is not set while gathering statistics in update_sg_wakeup_stats(). This means that a group can be classified as fully busy with 0 running tasks if utilization is high enough. This path is mainly used for fork and exec. Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20200218144534.4564-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c8a379c357e..c1217bfe5e81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8337,6 +8337,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_capacity = group->sgc->capacity; + sgs->group_weight = group->group_weight; + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); /* -- cgit v1.2.3