From f49249d58abdc12067d69f15d509487171148b30 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 8 Oct 2019 11:46:46 +0100 Subject: PM: sleep: include for pm_wq Include the for the definition of pm_wq to avoid the following warning: kernel/power/main.c:890:25: warning: symbol 'pm_wq' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index e8710d179b35..e26de7af520b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "power.h" -- cgit v1.2.3 From ff229eee3d897f52bd001c841f2d3cce8853ecdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2019 10:32:04 -0700 Subject: hrtimer: Annotate lockless access to timer->base Followup to commit dd2261ed45aa ("hrtimer: Protect lockless access to timer->base") lock_hrtimer_base() fetches timer->base without lock exclusion. Compiler is allowed to read timer->base twice (even if considered dumb) which could end up trying to lock migration_base and return &migration_base. base = timer->base; if (likely(base != &migration_base)) { /* compiler reads timer->base again, and now (base == &migration_base) raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* == &migration_base ! */ Similarly the write sides must use WRITE_ONCE() to avoid store tearing. Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191008173204.180879-1-edumazet@google.com --- kernel/time/hrtimer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0d4dc241c0fb..65605530ee34 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -164,7 +164,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, struct hrtimer_clock_base *base; for (;;) { - base = timer->base; + base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -244,7 +244,7 @@ again: return base; /* See the comment in lock_hrtimer_base() */ - timer->base = &migration_base; + WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -253,10 +253,10 @@ again: raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; - timer->base = base; + WRITE_ONCE(timer->base, base); goto again; } - timer->base = new_base; + WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { -- cgit v1.2.3 From b67114db64adcb0aaff0fe0bb2a84ede0a99aaf2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 4 Oct 2019 13:10:09 +0200 Subject: parisc: sysctl.c: Use CONFIG_PARISC instead of __hppa_ define Signed-off-by: Helge Deller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 00fcea236eba..b6f2f35d0bcf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,7 +163,7 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef __hppa__ +#ifdef CONFIG_PARISC extern int pwrsw_enabled; #endif @@ -620,7 +620,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef __hppa__ +#ifdef CONFIG_PARISC { .procname = "soft-power", .data = &pwrsw_enabled, -- cgit v1.2.3 From bc88f85c6c09306bd21917e1ae28205e9cd775a7 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 16 Oct 2019 12:24:58 +0100 Subject: kthread: make __kthread_queue_delayed_work static The __kthread_queue_delayed_work is not exported so make it static, to avoid the following sparse warning: kernel/kthread.c:869:6: warning: symbol '__kthread_queue_delayed_work' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Linus Torvalds --- kernel/kthread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 621467c33fef..b262f47046ca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -866,9 +866,9 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); -void __kthread_queue_delayed_work(struct kthread_worker *worker, - struct kthread_delayed_work *dwork, - unsigned long delay) +static void __kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; -- cgit v1.2.3 From 700dea5a0bea9f64eba89fae7cb2540326fdfdc1 Mon Sep 17 00:00:00 2001 From: Dmitry Goldin Date: Wed, 9 Oct 2019 13:42:14 +0000 Subject: kheaders: substituting --sort in archive creation The option --sort=ORDER was only introduced in tar 1.28 (2014), which is rather new and might not be available in some setups. This patch tries to replicate the previous behaviour as closely as possible to fix the kheaders build for older environments. It does not produce identical archives compared to the previous version due to minor sorting differences but produces reproducible results itself in my tests. Reported-by: Andreas Schwab Signed-off-by: Dmitry Goldin Tested-by: Andreas Schwab Tested-by: Quentin Perret Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index aff79e461fc9..5a0fc0b0403a 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -71,10 +71,13 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' -# Create archive and try to normalize metadata for reproducibility -tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --owner=0 --group=0 --sort=name --numeric-owner \ - -Jcf $tarfile -C $cpio_dir/ . > /dev/null +# Create archive and try to normalize metadata for reproducibility. +# For compatibility with older versions of tar, files are fed to tar +# pre-sorted, as --sort=name might not be available. +find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ + tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --numeric-owner --no-recursion \ + -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 -- cgit v1.2.3 From b1fc5833357524d5d342737913dbe32ff3557bc5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2019 11:45:36 +0100 Subject: stop_machine: Avoid potential race behaviour Both multi_cpu_stop() and set_state() access multi_stop_data::state racily using plain accesses. These are subject to compiler transformations which could break the intended behaviour of the code, and this situation is detected by KCSAN on both arm64 and x86 (splats below). Improve matters by using READ_ONCE() and WRITE_ONCE() to ensure that the compiler cannot elide, replay, or tear loads and stores. In multi_cpu_stop() the two loads of multi_stop_data::state are expected to be a consistent value, so snapshot the value into a temporary variable to ensure this. The state transitions are serialized by atomic manipulation of multi_stop_data::num_threads, and other fields in multi_stop_data are not modified while subject to concurrent reads. KCSAN splat on arm64: | BUG: KCSAN: data-race in multi_cpu_stop+0xa8/0x198 and set_state+0x80/0xb0 | | write to 0xffff00001003bd00 of 4 bytes by task 24 on cpu 3: | set_state+0x80/0xb0 | multi_cpu_stop+0x16c/0x198 | cpu_stopper_thread+0x170/0x298 | smpboot_thread_fn+0x40c/0x560 | kthread+0x1a8/0x1b0 | ret_from_fork+0x10/0x18 | | read to 0xffff00001003bd00 of 4 bytes by task 14 on cpu 1: | multi_cpu_stop+0xa8/0x198 | cpu_stopper_thread+0x170/0x298 | smpboot_thread_fn+0x40c/0x560 | kthread+0x1a8/0x1b0 | ret_from_fork+0x10/0x18 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 1 PID: 14 Comm: migration/1 Not tainted 5.3.0-00007-g67ab35a199f4-dirty #3 | Hardware name: linux,dummy-virt (DT) KCSAN splat on x86: | write to 0xffffb0bac0013e18 of 4 bytes by task 19 on cpu 2: | set_state kernel/stop_machine.c:170 [inline] | ack_state kernel/stop_machine.c:177 [inline] | multi_cpu_stop+0x1a4/0x220 kernel/stop_machine.c:227 | cpu_stopper_thread+0x19e/0x280 kernel/stop_machine.c:516 | smpboot_thread_fn+0x1a8/0x300 kernel/smpboot.c:165 | kthread+0x1b5/0x200 kernel/kthread.c:255 | ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:352 | | read to 0xffffb0bac0013e18 of 4 bytes by task 44 on cpu 7: | multi_cpu_stop+0xb4/0x220 kernel/stop_machine.c:213 | cpu_stopper_thread+0x19e/0x280 kernel/stop_machine.c:516 | smpboot_thread_fn+0x1a8/0x300 kernel/smpboot.c:165 | kthread+0x1b5/0x200 kernel/kthread.c:255 | ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:352 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 7 PID: 44 Comm: migration/7 Not tainted 5.3.0+ #1 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Acked-by: Marco Elver Link: https://lkml.kernel.org/r/20191007104536.27276-1-mark.rutland@arm.com --- kernel/stop_machine.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c7031a22aa7b..998d50ee2d9b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -7,6 +7,7 @@ * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo */ +#include #include #include #include @@ -167,7 +168,7 @@ static void set_state(struct multi_stop_data *msdata, /* Reset ack counter. */ atomic_set(&msdata->thread_ack, msdata->num_threads); smp_wmb(); - msdata->state = newstate; + WRITE_ONCE(msdata->state, newstate); } /* Last one to ack a state moves to the next state. */ @@ -186,7 +187,7 @@ void __weak stop_machine_yield(const struct cpumask *cpumask) static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; - enum multi_stop_state curstate = MULTI_STOP_NONE; + enum multi_stop_state newstate, curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; const struct cpumask *cpumask; unsigned long flags; @@ -210,8 +211,9 @@ static int multi_cpu_stop(void *data) do { /* Chill out and ensure we re-read multi_stop_state. */ stop_machine_yield(cpumask); - if (msdata->state != curstate) { - curstate = msdata->state; + newstate = READ_ONCE(msdata->state); + if (newstate != curstate) { + curstate = newstate; switch (curstate) { case MULTI_STOP_DISABLE_IRQ: local_irq_disable(); -- cgit v1.2.3 From 9fa8c9c647be624e91b09ecffa7cd97ee0600b40 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Fri, 18 Oct 2019 09:20:34 +0800 Subject: tracing: Fix "gfp_t" format for synthetic events In the format of synthetic events, the "gfp_t" is shown as "signed:1", but in fact the "gfp_t" is "unsigned", should be shown as "signed:0". The issue can be reproduced by the following commands: echo 'memlatency u64 lat; unsigned int order; gfp_t gfp_flags; int migratetype' > /sys/kernel/debug/tracing/synthetic_events cat /sys/kernel/debug/tracing/events/synthetic/memlatency/format name: memlatency ID: 2233 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:u64 lat; offset:8; size:8; signed:0; field:unsigned int order; offset:16; size:4; signed:0; field:gfp_t gfp_flags; offset:24; size:4; signed:1; field:int migratetype; offset:32; size:4; signed:1; print fmt: "lat=%llu, order=%u, gfp_flags=%x, migratetype=%d", REC->lat, REC->order, REC->gfp_flags, REC->migratetype Link: http://lkml.kernel.org/r/20191018012034.6404-1-zhengjun.xing@linux.intel.com Reviewed-by: Tom Zanussi Signed-off-by: Zhengjun Xing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 57648c5aa679..7482a1466ebf 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -679,6 +679,8 @@ static bool synth_field_signed(char *type) { if (str_has_prefix(type, "u")) return false; + if (strcmp(type, "gfp_t") == 0) + return false; return true; } -- cgit v1.2.3 From aa5de305c90c995321df452f274fe75b52bd8fcc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 18 Oct 2019 20:20:40 -0700 Subject: kernel/events/uprobes.c: only do FOLL_SPLIT_PMD for uprobe register Attaching uprobe to text section in THP splits the PMD mapped page table into PTE mapped entries. On uprobe detach, we would like to regroup PMD mapped page table entry to regain performance benefit of THP. However, the regroup is broken For perf_event based trace_uprobe. This is because perf_event based trace_uprobe calls uprobe_unregister twice on close: first in TRACE_REG_PERF_CLOSE, then in TRACE_REG_PERF_UNREGISTER. The second call will split the PMD mapped page table entry, which is not the desired behavior. Fix this by only use FOLL_SPLIT_PMD for uprobe register case. Add a WARN() to confirm uprobe unregister never work on huge pages, and abort the operation when this WARN() triggers. Link: http://lkml.kernel.org/r/20191017164223.2762148-6-songliubraving@fb.com Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") Signed-off-by: Song Liu Reviewed-by: Srikar Dronamraju Cc: Kirill A. Shutemov Cc: Oleg Nesterov Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 94d38a39d72e..c74761004ee5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,14 +474,17 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; + unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: + if (is_register) + gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -489,6 +492,12 @@ retry: if (ret <= 0) goto put_old; + if (WARN(!is_register && PageCompound(old_page), + "uprobe unregister should never work on compound page\n")) { + ret = -EINVAL; + goto put_old; + } + /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); -- cgit v1.2.3 From 77751a466ebd1a785456556061a2db6d60ea3898 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 16 Oct 2019 12:41:24 +0200 Subject: PM: QoS: Introduce frequency QoS Introduce frequency QoS, based on the "raw" low-level PM QoS, to represent min and max frequency requests and aggregate constraints. The min and max frequency requests are to be represented by struct freq_qos_request objects and the aggregate constraints are to be represented by struct freq_constraints objects. The latter are expected to be initialized with the help of freq_constraints_init(). The freq_qos_read_value() helper is defined to retrieve the aggregate constraints values from a given struct freq_constraints object and there are the freq_qos_add_request(), freq_qos_update_request() and freq_qos_remove_request() helpers to manipulate the min and max frequency requests. It is assumed that the the helpers will not run concurrently with each other for the same struct freq_qos_request object, so if that may be the case, their uses must ensure proper synchronization between them (e.g. through locking). In addition, freq_qos_add_notifier() and freq_qos_remove_notifier() are provided to add and remove notifiers that will trigger on aggregate constraint changes to and from a given struct freq_constraints object, respectively. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/power/qos.c | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9568a2fe7c11..04e83fdfbe80 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -650,3 +650,243 @@ static int __init pm_qos_power_init(void) } late_initcall(pm_qos_power_init); + +/* Definitions related to the frequency QoS below. */ + +/** + * freq_constraints_init - Initialize frequency QoS constraints. + * @qos: Frequency QoS constraints to initialize. + */ +void freq_constraints_init(struct freq_constraints *qos) +{ + struct pm_qos_constraints *c; + + c = &qos->min_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->type = PM_QOS_MAX; + c->notifiers = &qos->min_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); + + c = &qos->max_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->type = PM_QOS_MIN; + c->notifiers = &qos->max_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); +} + +/** + * freq_qos_read_value - Get frequency QoS constraint for a given list. + * @qos: Constraints to evaluate. + * @type: QoS request type. + */ +s32 freq_qos_read_value(struct freq_constraints *qos, + enum freq_qos_req_type type) +{ + s32 ret; + + switch (type) { + case FREQ_QOS_MIN: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MIN_DEFAULT_VALUE : + pm_qos_read_value(&qos->min_freq); + break; + case FREQ_QOS_MAX: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MAX_DEFAULT_VALUE : + pm_qos_read_value(&qos->max_freq); + break; + default: + WARN_ON(1); + ret = 0; + } + + return ret; +} + +/** + * freq_qos_apply - Add/modify/remove frequency QoS request. + * @req: Constraint request to apply. + * @action: Action to perform (add/update/remove). + * @value: Value to assign to the QoS request. + */ +static int freq_qos_apply(struct freq_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret; + + switch(req->type) { + case FREQ_QOS_MIN: + ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, + action, value); + break; + case FREQ_QOS_MAX: + ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, + action, value); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * freq_qos_add_request - Insert new frequency QoS request into a given list. + * @qos: Constraints to update. + * @req: Preallocated request object. + * @type: Request type. + * @value: Request value. + * + * Insert a new entry into the @qos list of requests, recompute the effective + * QoS constraint value for that list and initialize the @req object. The + * caller needs to save that object for later use in updates and removal. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_add_request(struct freq_constraints *qos, + struct freq_qos_request *req, + enum freq_qos_req_type type, s32 value) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !req) + return -EINVAL; + + if (WARN(freq_qos_request_active(req), + "%s() called for active request\n", __func__)) + return -EINVAL; + + req->qos = qos; + req->type = type; + ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); + if (ret < 0) { + req->qos = NULL; + req->type = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_request); + +/** + * freq_qos_update_request - Modify existing frequency QoS request. + * @req: Request to modify. + * @new_value: New request value. + * + * Update an existing frequency QoS request along with the effective constraint + * value for the list of requests it belongs to. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) +{ + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + if (req->pnode.prio == new_value) + return 0; + + return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); +} +EXPORT_SYMBOL_GPL(freq_qos_update_request); + +/** + * freq_qos_remove_request - Remove frequency QoS request from its list. + * @req: Request to remove. + * + * Remove the given frequency QoS request from the list of constraints it + * belongs to and recompute the effective constraint value for that list. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_remove_request(struct freq_qos_request *req) +{ + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + return freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); +} +EXPORT_SYMBOL_GPL(freq_qos_remove_request); + +/** + * freq_qos_add_notifier - Add frequency QoS change notifier. + * @qos: List of requests to add the notifier to. + * @type: Request type. + * @notifier: Notifier block to add. + */ +int freq_qos_add_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_register(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_register(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_notifier); + +/** + * freq_qos_remove_notifier - Remove frequency QoS change notifier. + * @qos: List of requests to remove the notifier from. + * @type: Request type. + * @notifier: Notifier block to remove. + */ +int freq_qos_remove_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); -- cgit v1.2.3 From 5e6c3c7b1ec217c1c4c95d9148182302b9969b97 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 21 Oct 2019 10:33:54 +0200 Subject: perf/aux: Fix tracking of auxiliary trace buffer allocation The following commit from the v5.4 merge window: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") ... breaks auxiliary trace buffer tracking. If I run command 'perf record -e rbd000' to record samples and saving them in the **auxiliary** trace buffer then the value of 'locked_vm' becomes negative after all trace buffers have been allocated and released: During allocation the values increase: [52.250027] perf_mmap user->locked_vm:0x87 pinned_vm:0x0 ret:0 [52.250115] perf_mmap user->locked_vm:0x107 pinned_vm:0x0 ret:0 [52.250251] perf_mmap user->locked_vm:0x188 pinned_vm:0x0 ret:0 [52.250326] perf_mmap user->locked_vm:0x208 pinned_vm:0x0 ret:0 [52.250441] perf_mmap user->locked_vm:0x289 pinned_vm:0x0 ret:0 [52.250498] perf_mmap user->locked_vm:0x309 pinned_vm:0x0 ret:0 [52.250613] perf_mmap user->locked_vm:0x38a pinned_vm:0x0 ret:0 [52.250715] perf_mmap user->locked_vm:0x408 pinned_vm:0x2 ret:0 [52.250834] perf_mmap user->locked_vm:0x408 pinned_vm:0x83 ret:0 [52.250915] perf_mmap user->locked_vm:0x408 pinned_vm:0x103 ret:0 [52.251061] perf_mmap user->locked_vm:0x408 pinned_vm:0x184 ret:0 [52.251146] perf_mmap user->locked_vm:0x408 pinned_vm:0x204 ret:0 [52.251299] perf_mmap user->locked_vm:0x408 pinned_vm:0x285 ret:0 [52.251383] perf_mmap user->locked_vm:0x408 pinned_vm:0x305 ret:0 [52.251544] perf_mmap user->locked_vm:0x408 pinned_vm:0x386 ret:0 [52.251634] perf_mmap user->locked_vm:0x408 pinned_vm:0x406 ret:0 [52.253018] perf_mmap user->locked_vm:0x408 pinned_vm:0x487 ret:0 [52.253197] perf_mmap user->locked_vm:0x408 pinned_vm:0x508 ret:0 [52.253374] perf_mmap user->locked_vm:0x408 pinned_vm:0x589 ret:0 [52.253550] perf_mmap user->locked_vm:0x408 pinned_vm:0x60a ret:0 [52.253726] perf_mmap user->locked_vm:0x408 pinned_vm:0x68b ret:0 [52.253903] perf_mmap user->locked_vm:0x408 pinned_vm:0x70c ret:0 [52.254084] perf_mmap user->locked_vm:0x408 pinned_vm:0x78d ret:0 [52.254263] perf_mmap user->locked_vm:0x408 pinned_vm:0x80e ret:0 The value of user->locked_vm increases to a limit then the memory is tracked by pinned_vm. During deallocation the size is subtracted from pinned_vm until it hits a limit. Then a larger value is subtracted from locked_vm leading to a large number (because of type unsigned): [64.267797] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x78d [64.267826] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x70c [64.267848] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x68b [64.267869] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x60a [64.267891] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x589 [64.267911] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x508 [64.267933] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x487 [64.267952] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x406 [64.268883] perf_mmap_close mmap_user->locked_vm:0x307 pinned_vm:0x406 [64.269117] perf_mmap_close mmap_user->locked_vm:0x206 pinned_vm:0x406 [64.269433] perf_mmap_close mmap_user->locked_vm:0x105 pinned_vm:0x406 [64.269536] perf_mmap_close mmap_user->locked_vm:0x4 pinned_vm:0x404 [64.269797] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff84 pinned_vm:0x303 [64.270105] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff04 pinned_vm:0x202 [64.270374] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe84 pinned_vm:0x101 [64.270628] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe04 pinned_vm:0x0 This value sticks for the user until system is rebooted, causing follow-on system calls using locked_vm resource limit to fail. Note: There is no issue using the normal trace buffer. In fact the issue is in perf_mmap_close(). During allocation auxiliary trace buffer memory is either traced as 'extra' and added to 'pinned_vm' or trace as 'user_extra' and added to 'locked_vm'. This applies for normal trace buffers and auxiliary trace buffer. However in function perf_mmap_close() all auxiliary trace buffer is subtraced from 'locked_vm' and never from 'pinned_vm'. This breaks the ballance. Signed-off-by: Thomas Richter Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: gor@linux.ibm.com Cc: hechaol@fb.com Cc: heiko.carstens@de.ibm.com Cc: linux-perf-users@vger.kernel.org Cc: songliubraving@fb.com Fixes: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") Link: https://lkml.kernel.org/r/20191021083354.67868-1-tmricht@linux.ibm.com [ Minor readability edits. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..f5d7950d1931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5607,8 +5607,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); + if (!rb->aux_mmap_locked) + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + else + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); -- cgit v1.2.3 From 6b1340cc00edeadd52ebd8a45171f38c8de2a387 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 15 Oct 2019 11:47:25 +0530 Subject: tracing: Fix race in perf_trace_buf initialization A race condition exists while initialiazing perf_trace_buf from perf_trace_init() and perf_kprobe_init(). CPU0 CPU1 perf_trace_init() mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() total_ref_count == 0 buf = alloc_percpu() perf_trace_buf[i] = buf tp_event->class->reg() //fails perf_kprobe_init() goto fail perf_trace_event_init() perf_trace_event_reg() fail: total_ref_count == 0 total_ref_count == 0 buf = alloc_percpu() perf_trace_buf[i] = buf tp_event->class->reg() total_ref_count++ free_percpu(perf_trace_buf[i]) perf_trace_buf[i] = NULL Any subsequent call to perf_trace_event_reg() will observe total_ref_count > 0, causing the perf_trace_buf to be always NULL. This can result in perf_trace_buf getting accessed from perf_trace_buf_alloc() without being initialized. Acquiring event_mutex in perf_kprobe_init() before calling perf_trace_event_init() should fix this race. The race caused the following bug: Unable to handle kernel paging request at virtual address 0000003106f2003c Mem abort info: ESR = 0x96000045 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000045 CM = 0, WnR = 1 user pgtable: 4k pages, 39-bit VAs, pgdp = ffffffc034b9b000 [0000003106f2003c] pgd=0000000000000000, pud=0000000000000000 Internal error: Oops: 96000045 [#1] PREEMPT SMP Process syz-executor (pid: 18393, stack limit = 0xffffffc093190000) pstate: 80400005 (Nzcv daif +PAN -UAO) pc : __memset+0x20/0x1ac lr : memset+0x3c/0x50 sp : ffffffc09319fc50 __memset+0x20/0x1ac perf_trace_buf_alloc+0x140/0x1a0 perf_trace_sys_enter+0x158/0x310 syscall_trace_enter+0x348/0x7c0 el0_svc_common+0x11c/0x368 el0_svc_handler+0x12c/0x198 el0_svc+0x8/0xc Ramdumps showed the following: total_ref_count = 3 perf_trace_buf = ( 0x0 -> NULL, 0x0 -> NULL, 0x0 -> NULL, 0x0 -> NULL) Link: http://lkml.kernel.org/r/1571120245-4186-1-git-send-email-prsood@codeaurora.org Cc: stable@vger.kernel.org Fixes: e12f03d7031a9 ("perf/core: Implement the 'perf_kprobe' PMU") Acked-by: Song Liu Signed-off-by: Prateek Sood Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0892e38ed6fb..a9dfa04ffa44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -272,9 +272,11 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } + mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_kprobe(tp_event); + mutex_unlock(&event_mutex); out: kfree(func); return ret; @@ -282,8 +284,10 @@ out: void perf_kprobe_destroy(struct perf_event *p_event) { + mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); } -- cgit v1.2.3 From f3a519e4add93b7b31a6616f0b09635ff2e6a159 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 22 Oct 2019 10:39:40 +0300 Subject: perf/aux: Fix AUX output stopping Commit: 8a58ddae2379 ("perf/core: Fix exclusive events' grouping") allows CAP_EXCLUSIVE events to be grouped with other events. Since all of those also happen to be AUX events (which is not the case the other way around, because arch/s390), this changes the rules for stopping the output: the AUX event may not be on its PMU's context any more, if it's grouped with a HW event, in which case it will be on that HW event's context instead. If that's the case, munmap() of the AUX buffer can't find and stop the AUX event, potentially leaving the last reference with the atomic context, which will then end up freeing the AUX buffer. This will then trip warnings: Fix this by using the context's PMU context when looking for events to stop, instead of the event's PMU context. Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20191022073940.61814-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f5d7950d1931..bb3748d29b04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6949,7 +6949,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->pmu; + struct pmu *pmu = event->ctx->pmu; struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, -- cgit v1.2.3 From 086ee46b08634a999bcd1707eabe3b0dc1806674 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Tue, 22 Oct 2019 14:12:26 +0100 Subject: timers/sched_clock: Include local timekeeping.h for missing declarations Include the timekeeping.h header to get the declaration of the sched_clock_{suspend,resume} functions. Fixes the following sparse warnings: kernel/time/sched_clock.c:275:5: warning: symbol 'sched_clock_suspend' was not declared. Should it be static? kernel/time/sched_clock.c:286:6: warning: symbol 'sched_clock_resume' was not declared. Should it be static? Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191022131226.11465-1-ben.dooks@codethink.co.uk --- kernel/time/sched_clock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 142b07619918..dbd69052eaa6 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -17,6 +17,8 @@ #include #include +#include "timekeeping.h" + /** * struct clock_read_data - data required to read from sched_clock() * -- cgit v1.2.3 From 7f2cbcbcafbca5360efbd175b3320852b8f7c637 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 21 Oct 2019 15:44:12 +0800 Subject: posix-cpu-timers: Fix two trivial comments Recent changes modified the function arguments of thread_group_sample_cputime() and task_cputimers_expired(), but forgot to update the comments. Fix it up. [ tglx: Changed the argument name of task_cputimers_expired() as the pointer points to an array of samples. ] Fixes: b7be4ef1365d ("posix-cpu-timers: Switch thread group sampling to array") Fixes: 001f7971433a ("posix-cpu-timers: Make expiry checks array based") Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571643852-21848-1-git-send-email-wang.yi59@zte.com.cn --- kernel/time/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 92a431981b1c..42d512fcfda2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -266,7 +266,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, /** * thread_group_sample_cputime - Sample cputime for a given task * @tsk: Task for which cputime needs to be started - * @iimes: Storage for time samples + * @samples: Storage for time samples * * Called from sys_getitimer() to calculate the expiry time of an active * timer. That means group cputime accounting is already active. Called @@ -1038,12 +1038,12 @@ unlock: * member of @pct->bases[CLK].nextevt. False otherwise */ static inline bool -task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) +task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct) { int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (sample[i] >= pct->bases[i].nextevt) + if (samples[i] >= pct->bases[i].nextevt) return true; } return false; -- cgit v1.2.3