From 74b22c465cd2b6ff4b8cec3997512ec807e6e495 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Aug 2015 09:42:34 +0930 Subject: params: don't ignore the rest of cmdline if parse_one() fails parse_args() just aborts after it hits an error, so other args at the same initcall level are simply ignored. This can lead to other hard-to-understand problems, for example my testing machine panics during the boot if I pass "locktorture.verbose=true". Change parse_args() to save the err code for return and continue. Signed-off-by: Oleg Nesterov Signed-off-by: Rusty Russell --- kernel/params.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b6554aa71094..ed1e0a1cffa7 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -223,7 +223,7 @@ char *parse_args(const char *doing, int (*unknown)(char *param, char *val, const char *doing, void *arg)) { - char *param, *val; + char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); @@ -238,7 +238,7 @@ char *parse_args(const char *doing, args = next_arg(args, ¶m, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) - return args; + return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, arg, unknown); @@ -247,24 +247,25 @@ char *parse_args(const char *doing, doing, param); switch (ret) { + case 0: + continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ERR_PTR(ret); + break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); - case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); + break; } + + err = ERR_PTR(ret); } - /* All parsed OK. */ - return NULL; + return err; } /* Lazy bastard, eh? */ -- cgit v1.2.3 From d741314fe8318c1af497b8cf04ea3976b9509eca Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 15 Sep 2015 02:37:48 -0400 Subject: devm_memunmap: use devres_release() Remove open coded call to memunmap. Cc: Christoph Hellwig Cc: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 72b0c66628b6..0756273437e0 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -131,9 +131,8 @@ EXPORT_SYMBOL(devm_memremap); void devm_memunmap(struct device *dev, void *addr) { - WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, - addr)); - memunmap(addr); + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); } EXPORT_SYMBOL(devm_memunmap); -- cgit v1.2.3 From b36f47617f6ce7c5e8e7c264b9d9ea0654d9f20a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 15 Sep 2015 02:42:20 -0400 Subject: devm_memremap: convert to return ERR_PTR Make devm_memremap consistent with the error return scheme of devm_memremap_pages to remove special casing in the pmem driver. Cc: Christoph Hellwig Cc: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 0756273437e0..0d818ce04129 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -116,7 +116,7 @@ void *devm_memremap(struct device *dev, resource_size_t offset, ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) - return NULL; + return ERR_PTR(-ENOMEM); addr = memremap(offset, size, flags); if (addr) { -- cgit v1.2.3 From 7eff93b7c99f5d0024aee677c6c92e32af22e1d2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Oct 2015 20:35:55 -0400 Subject: devm_memremap_pages: use numa_mem_id Hint to closest numa node for the placement of newly allocated pages. As that is where the device's other allocations will originate by default when it does not specify a NUMA node. Cc: Ross Zwisler Reviewed-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 0d818ce04129..56fc4783879c 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -174,7 +174,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) nid = dev_to_node(dev); if (nid < 0) - nid = 0; + nid = numa_mem_id(); error = arch_add_memory(nid, res->start, resource_size(res), true); if (error) { -- cgit v1.2.3 From 538ea4aa44737127ce2b5c8511c7349d2abdcf9c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Oct 2015 20:35:56 -0400 Subject: pmem, memremap: convert to numa aware allocations Given that pmem ranges come with numa-locality hints, arrange for the resulting driver objects to be obtained from node-local memory. Reviewed-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 56fc4783879c..3218e8b1fc28 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -114,7 +114,8 @@ void *devm_memremap(struct device *dev, resource_size_t offset, { void **ptr, *addr; - ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); if (!ptr) return ERR_PTR(-ENOMEM); @@ -165,8 +166,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); - page_map = devres_alloc(devm_memremap_pages_release, - sizeof(*page_map), GFP_KERNEL); + page_map = devres_alloc_node(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 22b886dd1018093920c4250dee2a9a3cb7cff7b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Nov 2015 12:15:33 -0500 Subject: timers: Use proper base migration in add_timer_on() Regardless of the previous CPU a timer was on, add_timer_on() currently simply sets timer->flags to the new CPU. As the caller must be seeing the timer as idle, this is locally fine, but the timer leaving the old base while unlocked can lead to race conditions as follows. Let's say timer was on cpu 0. cpu 0 cpu 1 ----------------------------------------------------------------------------- del_timer(timer) succeeds del_timer(timer) lock_timer_base(timer) locks cpu_0_base add_timer_on(timer, 1) spin_lock(&cpu_1_base->lock) timer->flags set to cpu_1_base operates on @timer operates on @timer This triggered with mod_delayed_work_on() which contains "if (del_timer()) add_timer_on()" sequence eventually leading to the following oops. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] detach_if_pending+0x69/0x1a0 ... Workqueue: wqthrash wqthrash_workfunc [wqthrash] task: ffff8800172ca680 ti: ffff8800172d0000 task.ti: ffff8800172d0000 RIP: 0010:[] [] detach_if_pending+0x69/0x1a0 ... Call Trace: [] del_timer+0x44/0x60 [] try_to_grab_pending+0xb6/0x160 [] mod_delayed_work_on+0x33/0x80 [] wqthrash_workfunc+0x61/0x90 [wqthrash] [] process_one_work+0x1e8/0x650 [] worker_thread+0x4e/0x450 [] kthread+0xef/0x110 [] ret_from_fork+0x3f/0x70 Fix it by updating add_timer_on() to perform proper migration as __mod_timer() does. Reported-and-tested-by: Jeff Layton Signed-off-by: Tejun Heo Cc: Chris Worley Cc: bfields@fieldses.org Cc: Michael Skralivetsky Cc: Trond Myklebust Cc: Shaohua Li Cc: Jeff Layton Cc: kernel-team@fb.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151029103113.2f893924@tlielax.poochiereds.net Link: http://lkml.kernel.org/r/20151104171533.GI5749@mtj.duckdns.org Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 74591ba9474f..bbc5d1114583 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -977,13 +977,29 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); + struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); + struct tvec_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the + * wrong base locked. See lock_timer_base(). + */ + base = lock_timer_base(timer, &flags); + if (base != new_base) { + timer->flags |= TIMER_MIGRATING; + + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | cpu); + } + debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.2.3 From 8b1291994d8e5e621a8af7e165b106e50d04bbf1 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Fri, 6 Nov 2015 16:04:16 +0800 Subject: tracing: Make tracing work when debugfs is not configured in Currently tracing_init_dentry() returns -ENODEV when debugfs is not configured in, which causes tracefs not populated with tracing files and directories, so we will get an empty directory even after we manually mount tracefs. We can make tracing_init_dentry() return NULL if debugfs is not configured in and can manually mount tracefs. But return -ENODEV if debugfs is configured in but not initialized or failed to create automount point as that would break backward compatibility with older tools. Link: http://lkml.kernel.org/r/1446797056-11683-1-git-send-email-hello.wjx@gmail.com Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2198a630ef58..08af79c106e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6847,7 +6847,9 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!debugfs_initialized())) + if (WARN_ON(!tracefs_initialized()) || + (IS_ENABLED(CONFIG_DEBUG_FS) && + WARN_ON(!debugfs_initialized()))) return ERR_PTR(-ENODEV); /* -- cgit v1.2.3 From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:21 -0800 Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd __GFP_WAIT has been used to identify atomic context in callers that hold spinlocks or are in interrupts. They are expected to be high priority and have access one of two watermarks lower than "min" which can be referred to as the "atomic reserve". __GFP_HIGH users get access to the first lower watermark and can be called the "high priority reserve". Over time, callers had a requirement to not block when fallback options were available. Some have abused __GFP_WAIT leading to a situation where an optimisitic allocation with a fallback option can access atomic reserves. This patch uses __GFP_ATOMIC to identify callers that are truely atomic, cannot sleep and have no alternative. High priority users continue to use __GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM to identify callers that want to wake kswapd for background reclaim. __GFP_WAIT is redefined as a caller that is willing to enter direct reclaim and wake kswapd for background reclaim. This patch then converts a number of sites o __GFP_ATOMIC is used by callers that are high priority and have memory pools for those requests. GFP_ATOMIC uses this flag. o Callers that have a limited mempool to guarantee forward progress clear __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall into this category where kswapd will still be woken but atomic reserves are not used as there is a one-entry mempool to guarantee progress. o Callers that are checking if they are non-blocking should use the helper gfpflags_allow_blocking() where possible. This is because checking for __GFP_WAIT as was done historically now can trigger false positives. Some exceptions like dm-crypt.c exist where the code intent is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to flag manipulations. o Callers that built their own GFP flags instead of starting with GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM. The first key hazard to watch out for is callers that removed __GFP_WAIT and was depending on access to atomic reserves for inconspicuous reasons. In some cases it may be appropriate for them to use __GFP_HIGH. The second key hazard is callers that assembled their own combination of GFP flags instead of starting with something like GFP_KERNEL. They may now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless if it's missed in most cases as other activity will wake kswapd. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 6 +++--- kernel/cgroup.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/smp.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8a056a32ded7..5ffcbd354a52 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1371,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_DIRECT_RECLAIM) { if (audit_pid && audit_pid == current->pid) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9d0cce3f9ce..f1603c153890 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -299,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4e49cc4c9952..deae3907ac1e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4e1e2f..3a970604308f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/smp.c b/kernel/smp.c index 07854477c164..d903c02223af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); -- cgit v1.2.3 From 71baba4b92dc1fa1bc461742c6ab1942ec6034e9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:28 -0800 Subject: mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM __GFP_WAIT was used to signal that the caller was in atomic context and could not sleep. Now it is possible to distinguish between true atomic context and callers that are not willing to sleep. The latter should clear __GFP_DIRECT_RECLAIM so kswapd will still wake. As clearing __GFP_WAIT behaves differently, there is a risk that people will clear the wrong flags. This patch renames __GFP_WAIT to __GFP_RECLAIM to clearly indicate what it does -- setting it allows all reclaim activity, clearing them prevents it. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Cc: Christoph Lameter Acked-by: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b2066fb5b10f..12cd989dadf6 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; @@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + __GFP_RECLAIM | __GFP_HIGH : + __GFP_RECLAIM | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { if (i < LZO_CMP_PAGES) { -- cgit v1.2.3 From 3d9c637f4ae74b45d95bb6cbd793fbffad0a709c Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 6 Nov 2015 16:29:12 -0800 Subject: module: export param_free_charp() Change the param_free_charp() function from static to exported. It is used by zswap in the next patch ("zswap: use charp for zswap param strings"). Signed-off-by: Dan Streetman Acked-by: Rusty Russell Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b6554aa71094..93a380a2345d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -325,10 +325,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_charp); -static void param_free_charp(void *arg) +void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } +EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, -- cgit v1.2.3 From 3824657c522f19f85a76bd932821174a5557a382 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 6 Nov 2015 16:30:38 -0800 Subject: printk: prevent userland from spoofing kernel messages The following statement of ABI/testing/dev-kmsg is not quite right: It is not possible to inject messages from userspace with the facility number LOG_KERN (0), to make sure that the origin of the messages can always be reliably determined. Userland actually can inject messages with a facility of 0 by abusing the fact that the facility is stored in a u8 data type. By using a facility which is a multiple of 256 the assignment of msg->facility in log_store() implicitly truncates it to 0, i.e. LOG_KERN, allowing users of /dev/kmsg to spoof kernel messages as shown below: The following call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0 >/dev/kmsg ...leads to the following log entry (dmesg -x | tail -n 1): user :emerg : [ 66.137758] Kernel panic - not syncing: beer empty However, this call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0x800 >/dev/kmsg ...leads to the slightly different log entry (note the kernel facility): kern :emerg : [ 74.177343] Kernel panic - not syncing: beer empty Fix that by limiting the user provided facility to 8 bit right from the beginning and catch the truncation early. Fixes: 7ff9554bb578 ("printk: convert byte-buffer to variable-length...") Signed-off-by: Mathias Krause Cc: Greg Kroah-Hartman Cc: Petr Mladek Cc: Alex Elder Cc: Joe Perches Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b16f35487b67..2ce8826f1053 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,6 +269,9 @@ static u32 clear_idx; #define PREFIX_MAX 32 #define LOG_LINE_MAX (1024 - PREFIX_MAX) +#define LOG_LEVEL(v) ((v) & 0x07) +#define LOG_FACILITY(v) ((v) >> 3 & 0xff) + /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 @@ -612,7 +615,6 @@ struct devkmsg_user { static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; - int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ size_t len = iov_iter_count(from); @@ -642,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) line = buf; if (line[0] == '<') { char *endp = NULL; + unsigned int u; - i = simple_strtoul(line+1, &endp, 10); + u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; + level = LOG_LEVEL(u); + if (LOG_FACILITY(u) != 0) + facility = LOG_FACILITY(u); endp++; len -= endp - line; line = endp; -- cgit v1.2.3 From 2e01fabe67ccaff1d59bda01e60a61f5fb0aa7b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:19 -0800 Subject: signals: kill block_all_signals() and unblock_all_signals() It is hardly possible to enumerate all problems with block_all_signals() and unblock_all_signals(). Just for example, 1. block_all_signals(SIGSTOP/etc) simply can't help if the caller is multithreaded. Another thread can dequeue the signal and force the group stop. 2. Even is the caller is single-threaded, it will "stop" anyway. It will not sleep, but it will spin in kernel space until SIGCONT or SIGKILL. And a lot more. In short, this interface doesn't work at all, at least the last 10+ years. Daniel said: Yeah the only times I played around with the DRM_LOCK stuff was when old drivers accidentally deadlocked - my impression is that the entire DRM_LOCK thing was never really tested properly ;-) Hence I'm all for purging where this leaks out of the drm subsystem. Signed-off-by: Oleg Nesterov Acked-by: Daniel Vetter Acked-by: Dave Airlie Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 51 +-------------------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0f6bbbe77b46..f2cbd4ed5cd4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. */ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - + if (sig) collect_signal(sig, pending, info); - } - return sig; } @@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - /* * System call entry points. -- cgit v1.2.3 From 5fa534c987784c4811757a34c425aff3ce3b5037 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:31 -0800 Subject: coredump: ensure all coredumping tasks have SIGNAL_GROUP_COREDUMP task_will_free_mem() is wrong in many ways, and in particular the SIGNAL_GROUP_COREDUMP check is not reliable: a task can participate in the coredumping without SIGNAL_GROUP_COREDUMP bit set. change zap_threads() paths to always set SIGNAL_GROUP_COREDUMP even if other CLONE_VM processes can't react to SIGKILL. Fortunately, at least oom-kill case if fine; it kills all tasks sharing the same mm, so it should also kill the process which actually dumps the core. The change in prepare_signal() is not strictly necessary, it just ensures that the patch does not bring another subtle behavioural change. But it reminds us that this SIGNAL_GROUP_EXIT/COREDUMP case needs more changes. Signed-off-by: Oleg Nesterov Cc: David Rientjes Cc: Kyle Walker Acked-by: Michal Hocko Cc: Stanislav Kozina Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f2cbd4ed5cd4..c0b01fe24bbd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -788,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (signal->flags & SIGNAL_GROUP_COREDUMP) + if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. -- cgit v1.2.3 From de90a6bcaede81f35e8caf4566d1006267230377 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 6 Nov 2015 16:32:45 -0800 Subject: kexec: use file name as the output message prefix kexec output message misses the prefix "kexec", when Dave Young split the kexec code. Now, we use file name as the output message prefix. Currently, the format of output message: [ 140.290795] SYSC_kexec_load: hello, world [ 140.291534] kexec: sanity_check_segment_list: hello, world Ideally, the format of output message: [ 30.791503] kexec: SYSC_kexec_load, Hello, world [ 79.182752] kexec_core: sanity_check_segment_list, Hello, world Remove the custom prefix "kexec" in output message. Signed-off-by: Minfei Huang Acked-by: Dave Young Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ kernel/kexec_core.c | 4 ++-- kernel/kexec_file.c | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 4c5edc357923..d873b64fbddc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bd9f8a03cefa..11b64a63c0f8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -6,7 +6,7 @@ * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void) crash_notes = __alloc_percpu(size, align); if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6a9a3f2a0e8e..b70ada0028d2 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -9,6 +9,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -- cgit v1.2.3 From 8639b46139b0e4ea3b1ab1c274e410ee327f1d89 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Fri, 6 Nov 2015 16:32:48 -0800 Subject: pidns: fix set/getpriority and ioprio_set/get in PRIO_USER mode setpriority(PRIO_USER, 0, x) will change the priority of tasks outside of the current pid namespace. This is in contrast to both the other modes of setpriority and the example of kill(-1). Fix this. getpriority and ioprio have the same failure mode, fix them too. Eric said: : After some more thinking about it this patch sounds justifiable. : : My goal with namespaces is not to build perfect isolation mechanisms : as that can get into ill defined territory, but to build well defined : mechanisms. And to handle the corner cases so you can use only : a single namespace with well defined results. : : In this case you have found the two interfaces I am aware of that : identify processes by uid instead of by pid. Which quite frankly is : weird. Unfortunately the weird unexpected cases are hard to handle : in the usual way. : : I was hoping for a little more information. Changes like this one we : have to be careful of because someone might be depending on the current : behavior. I don't think they are and I do think this make sense as part : of the pid namespace. Signed-off-by: Ben Segall Cc: Oleg Nesterov Cc: Al Viro Cc: Ambrose Feinstein Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f671a5c..6af9212ab5aa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) @@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; -- cgit v1.2.3 From 08d78658f393fefaa2e6507ea052c6f8ef4002a2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Nov 2015 16:32:58 -0800 Subject: panic: release stale console lock to always get the logbuf printed out In some cases we may end up killing the CPU holding the console lock while still having valuable data in logbuf. E.g. I'm observing the following: - A crash is happening on one CPU and console_unlock() is being called on some other. - console_unlock() tries to print out the buffer before releasing the lock and on slow console it takes time. - in the meanwhile crashing CPU does lots of printk()-s with valuable data (which go to the logbuf) and sends IPIs to all other CPUs. - console_unlock() finishes printing previous chunk and enables interrupts before trying to print out the rest, the CPU catches the IPI and never releases console lock. This is not the only possible case: in VT/fb subsystems we have many other console_lock()/console_unlock() users. Non-masked interrupts (or receiving NMI in case of extreme slowness) will have the same result. Getting the whole console buffer printed out on crash should be top priority. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Masami Hiramatsu Cc: Jiri Kosina Cc: Baoquan He Cc: Prarit Bhargava Cc: Xie XiuQi Cc: Seth Jennings Cc: "K. Y. Srinivasan" Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 04e91ff7560b..4579dbb7ed87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -147,6 +148,15 @@ void panic(const char *fmt, ...) bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. + */ + console_trylock(); + console_unlock(); + if (!panic_blink) panic_blink = no_blink; -- cgit v1.2.3 From 03e88ae6b369da2a26a6e09ad165e57d210789cd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 6 Nov 2015 22:07:26 +0300 Subject: tracing: Remove unused ftrace_cpu_disabled per cpu variable Since the ring buffer is lockless, there is no need to disable ftrace on CPU. And no one doing so: after commit 68179686ac67cb ("tracing: Remove ftrace_disable/enable_cpu()") ftrace_cpu_disabled stays the same after initialization, nothing changes it. ftrace_cpu_disabled shouldn't be used by any external module since it disables only function and graph_function tracers but not any other tracer. Link: http://lkml.kernel.org/r/1446836846-22239-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ------ kernel/trace/trace.h | 1 - kernel/trace/trace_functions_graph.c | 6 ------ 3 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 08af79c106e1..b11582618991 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -100,8 +100,6 @@ static DEFINE_PER_CPU(bool, trace_cmdline_save); */ static int tracing_disabled = 1; -DEFINE_PER_CPU(int, ftrace_cpu_disabled); - cpumask_var_t __read_mostly tracing_buffer_mask; /* @@ -1775,10 +1773,6 @@ trace_function(struct trace_array *tr, struct ring_buffer_event *event; struct ftrace_entry *entry; - /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd7620802e72..919d9d07686f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,7 +667,6 @@ extern int DYN_FTRACE_TEST_NAME2(void); extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; -DECLARE_PER_CPU(int, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 92382af7a213..a663cbb84107 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -288,9 +288,6 @@ int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return 0; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) @@ -403,9 +400,6 @@ void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, sizeof(*entry), flags, pc); if (!event) -- cgit v1.2.3 From 2fd59077755c44dbbd9b2fa89cf988235a3a6a2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Nov 2015 05:48:38 -0800 Subject: perf: Disable IRQs across RCU RS CS that acquires scheduler lock The perf_lock_task_context() function disables preemption across its RCU read-side critical section because that critical section acquires a scheduler lock. If there was a preemption during that RCU read-side critical section, the rcu_read_unlock() could attempt to acquire scheduler locks, resulting in deadlock. However, recent optimizations to expedited grace periods mean that IPI handlers that execute during preemptible RCU read-side critical sections can now cause the subsequent rcu_read_unlock() to acquire scheduler locks. Disabling preemption does nothiing to prevent these IPI handlers from executing, so these optimizations introduced a deadlock. In theory, this deadlock could be avoided by pulling all wakeups and printk()s out from rnp->lock critical sections, but in practice this would re-introduce some RCU CPU stall warning bugs. Given that acquiring scheduler locks entails disabling interrupts, these deadlocks can be avoided by disabling interrupts (instead of disabling preemption) across any RCU read-side critical that acquires scheduler locks and holds them across the rcu_read_unlock(). This commit therefore makes this change for perf_lock_task_context(). Reported-by: Dave Jones Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151104134838.GR29027@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ea02109aee77..f8e5c443d74e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1050,13 +1050,13 @@ retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when - * part of the read side critical section was preemptible -- see + * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read - * side critical section is non-preemptible. + * side critical section has interrupts disabled. */ - preempt_disable(); + local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { @@ -1070,21 +1070,22 @@ retry: * if so. If we locked the right context, then it * can't get swapped on us any more. */ - raw_spin_lock_irqsave(&ctx->lock, *flags); + raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock(&ctx->lock); rcu_read_unlock(); - preempt_enable(); + local_irq_restore(*flags); goto retry; } if (!atomic_inc_not_zero(&ctx->refcount)) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock(&ctx->lock); ctx = NULL; } } rcu_read_unlock(); - preempt_enable(); + if (!ctx) + local_irq_restore(*flags); return ctx; } -- cgit v1.2.3 From b71b437eedaed985062492565d9d421d975ae845 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Nov 2015 10:50:51 +0100 Subject: perf: Fix inherited events vs. tracepoint filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arnaldo reported that tracepoint filters seem to misbehave (ie. not apply) on inherited events. The fix is obvious; filters are only set on the actual (parent) event, use the normal pattern of using this parent event for filters. This is safe because each child event has a reference to it. Reported-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Peter Zijlstra (Intel) Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frédéric Weisbecker Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Wang Nan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151102095051.GN17308@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f8e5c443d74e..98a4b9db7f37 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6909,6 +6909,10 @@ static int perf_tp_filter_match(struct perf_event *event, { void *record = data->raw->data; + /* only top level events have filters set */ + if (event->parent) + event = event->parent; + if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; -- cgit v1.2.3 From 25b3e5a3344e1f700c1efec5b6f0199f04707fb1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Nov 2015 15:56:22 -0500 Subject: sched/numa: Fix math underflow in task_tick_numa() The NUMA balancing code implements delays in scanning by advancing curr->node_stamp beyond curr->se.sum_exec_runtime. With unsigned math, that creates an underflow, which results in task_numa_work being queued all the time, even when we don't want to. Avoiding the math underflow makes it possible to reduce CPU overhead in the NUMA balancing code. Reported-and-tested-by: Jan Stancek Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/1446756983-28173-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 824aa9f501a3..f04fda8f669c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2302,7 +2302,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) now = curr->se.sum_exec_runtime; period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - if (now - curr->node_stamp > period) { + if (now > curr->node_stamp + period) { if (!curr->node_stamp) curr->numa_scan_period = task_scan_min(curr); curr->node_stamp += period; -- cgit v1.2.3 From 79211c8ed19c055ca105502c8733800d442a0ae6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 9 Nov 2015 14:58:13 -0800 Subject: remove abs64() Switch everything to the new and more capable implementation of abs(). Mainly to give the new abs() a bit of a workout. Cc: Michal Nazarewicz Cc: John Stultz Cc: Ingo Molnar Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 2 +- kernel/time/timekeeping.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0d8fe8b8f727..1347882d131e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b1356b7ae570..d563c1960302 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, negative = (tick_error < 0); /* Sort out the magnitude of the correction */ - tick_error = abs64(tick_error); + tick_error = abs(tick_error); for (adj = 0; tick_error > interval; adj++) tick_error >>= 1; -- cgit v1.2.3 From f70cd6b07e629f367bb9b1ac9d0e3e669eb325c0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 02:39:55 +0100 Subject: context_tracking: remove duplicate enabled check All calls to context_tracking_enter and context_tracking_exit are already checking context_tracking_is_enabled, except the context_tracking_user_enter and context_tracking_user_exit functions left in for the benefit of assembly calls. Pull the check up to those functions, by making them simple wrappers around the user_enter and user_exit inline functions. Cc: Frederic Weisbecker Cc: Paul McKenney Reviewed-by: Rik van Riel Tested-by: Rik van Riel Acked-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- kernel/context_tracking.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 0a495ab35bc7..6d4c6ce21275 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -62,15 +62,6 @@ void context_tracking_enter(enum ctx_state state) { unsigned long flags; - /* - * Repeat the user_enter() check here because some archs may be calling - * this from asm and if no CPU needs context tracking, they shouldn't - * go further. Repeat the check here until they support the inline static - * key check. - */ - if (!context_tracking_is_enabled()) - return; - /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -128,7 +119,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter); void context_tracking_user_enter(void) { - context_tracking_enter(CONTEXT_USER); + user_enter(); } NOKPROBE_SYMBOL(context_tracking_user_enter); @@ -148,9 +139,6 @@ void context_tracking_exit(enum ctx_state state) { unsigned long flags; - if (!context_tracking_is_enabled()) - return; - if (in_interrupt()) return; @@ -181,7 +169,7 @@ EXPORT_SYMBOL_GPL(context_tracking_exit); void context_tracking_user_exit(void) { - context_tracking_exit(CONTEXT_USER); + user_exit(); } NOKPROBE_SYMBOL(context_tracking_user_exit); -- cgit v1.2.3 From d0e536d89395ecd8ab78fe999dc4d6f5d140ce46 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 02:39:56 +0100 Subject: context_tracking: avoid irq_save/irq_restore on guest entry and exit guest_enter and guest_exit must be called with interrupts disabled, since they take the vtime_seqlock with write_seq{lock,unlock}. Therefore, it is not necessary to check for exceptions, nor to save/restore the IRQ state, when context tracking functions are called by guest_enter and guest_exit. Split the body of context_tracking_entry and context_tracking_exit out to __-prefixed functions, and use them from KVM. Rik van Riel has measured this to speed up a tight vmentry/vmexit loop by about 2%. Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: Paul McKenney Reviewed-by: Rik van Riel Tested-by: Rik van Riel Signed-off-by: Paolo Bonzini --- kernel/context_tracking.c | 64 ++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6d4c6ce21275..d8560ee3bab7 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -58,27 +58,13 @@ static void context_tracking_recursion_exit(void) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void context_tracking_enter(enum ctx_state state) +void __context_tracking_enter(enum ctx_state state) { - unsigned long flags; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); - local_irq_save(flags); if (!context_tracking_recursion_enter()) - goto out_irq_restore; + return; if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { @@ -111,7 +97,27 @@ void context_tracking_enter(enum ctx_state state) __this_cpu_write(context_tracking.state, state); } context_tracking_recursion_exit(); -out_irq_restore: +} +NOKPROBE_SYMBOL(__context_tracking_enter); +EXPORT_SYMBOL_GPL(__context_tracking_enter); + +void context_tracking_enter(enum ctx_state state) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_enter(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); @@ -135,16 +141,10 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_exit(enum ctx_state state) +void __context_tracking_exit(enum ctx_state state) { - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); if (!context_tracking_recursion_enter()) - goto out_irq_restore; + return; if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { @@ -161,7 +161,19 @@ void context_tracking_exit(enum ctx_state state) __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); -out_irq_restore: +} +NOKPROBE_SYMBOL(__context_tracking_exit); +EXPORT_SYMBOL_GPL(__context_tracking_exit); + +void context_tracking_exit(enum ctx_state state) +{ + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_exit(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); -- cgit v1.2.3 From 4717f133736dec10605da9e29e707144c8d486df Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 10 Nov 2015 11:58:12 +0200 Subject: genirq/PM: Restore system wake up from chained interrupts Commit e509bd7da149 ("genirq: Allow migration of chained interrupts by installing default action") breaks PCS wake up IRQ behaviour on TI OMAP based platforms (dra7-evm). TI OMAP IRQ wake up configuration: GIC-irqchip->PCM_IRQ |- omap_prcm_register_chain_handler |- PRCM-irqchip -> PRCM_IO_IRQ |- pcs_irq_chain_handler |- pinctrl-irqchip -> PCS_uart1_wakeup_irq This happens because IRQ PM code (irq/pm.c) is expected to ignore chained interrupts by default: static bool suspend_device_irq(struct irq_desc *desc) { if (!desc->action || desc->no_suspend_depth) return false; - it's expected !desc->action = true for chained interrupts; but, after above change, all chained interrupt descriptors will have default action handler installed - chained_action. As result, chained interrupts will be silently disabled during system suspend. Hence, fix it by introducing helper function irq_desc_is_chained() and use it in suspend_device_irq() for chained interrupts identification and skip them, once detected. Fixes: e509bd7da149 ("genirq: Allow migration of chained interrupts..") Signed-off-by: Grygorii Strashko Reviewed-by: Mika Westerberg Cc: Tony Lindgren Cc: Cc: Cc: Tony Lindgren Link: http://lkml.kernel.org/r/1447149492-20699-1-git-send-email-grygorii.strashko@ti.com Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 5 +++++ kernel/irq/pm.c | 3 ++- kernel/irq/proc.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 05c2188271b8..fcab63c66905 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -199,6 +199,11 @@ static inline int irq_desc_get_node(struct irq_desc *desc) return irq_common_data_get_node(&desc->irq_common_data); } +static inline int irq_desc_is_chained(struct irq_desc *desc) +{ + return (desc->action && desc->action == &chained_action); +} + #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 21c62617a35a..84ab239a00e2 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -70,7 +70,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) static bool suspend_device_irq(struct irq_desc *desc) { - if (!desc->action || desc->no_suspend_depth) + if (!desc->action || irq_desc_is_chained(desc) || + desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(&desc->irq_data)) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a916cf144b65..a2c02fd5d6d0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -475,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; - if ((!action || action == &chained_action) && !any_count) + if ((!action || irq_desc_is_chained(desc)) && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); -- cgit v1.2.3 From e428abbbf616cd8fdd1162e4a624ad1d47b47544 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 10 Nov 2015 05:15:15 +0800 Subject: tracing: #ifdef out uses of max trace when CONFIG_TRACER_MAX_TRACE is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tracing_max_lat_fops is used only when TRACER_MAX_TRACE enabled, so also swith the related code. The related warning with defconfig under x86_64: CC kernel/trace/trace.o kernel/trace/trace.c:5466:37: warning: ‘tracing_max_lat_fops’ defined but not used [-Wunused-const-variable] static const struct file_operations tracing_max_lat_fops = { Signed-off-by: Chen Gang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b11582618991..87fb9801bd9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4548,6 +4548,8 @@ out: return ret; } +#ifdef CONFIG_TRACER_MAX_TRACE + static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4562,6 +4564,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); } +#endif + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -5463,12 +5467,14 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, }; +#endif static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, -- cgit v1.2.3 From a31d82d85afdcbdb8c4128dfd6146992dc6b3576 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Nov 2015 15:28:17 -0500 Subject: bpf_trace: Make dependent on PERF_EVENTS Arnd Bergmann reported: In my ARM randconfig tests, I'm getting a build error for newly added code in bpf_perf_event_read and bpf_perf_event_output whenever CONFIG_PERF_EVENTS is disabled: kernel/trace/bpf_trace.c: In function 'bpf_perf_event_read': kernel/trace/bpf_trace.c:203:11: error: 'struct perf_event' has no member named 'oncpu' if (event->oncpu != smp_processor_id() || ^ kernel/trace/bpf_trace.c:204:11: error: 'struct perf_event' has no member named 'pmu' event->pmu->count) This can happen when UPROBE_EVENT is enabled but KPROBE_EVENT is disabled. I'm not sure if that is a configuration we care about, otherwise we could prevent this case from occuring by adding Kconfig dependencies. Looking at this further, it's really that UPROBE_EVENT enables PERF_EVENTS. By just having BPF_EVENTS depend on PERF_EVENTS, then all is fine. Link: http://lkml.kernel.org/r/4525348.Aq9YoXkChv@wuerfel Reported-by: Arnd Bergmann Signed-off-by: Steven Rostedt Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1153c43428f3..f5727b586c8e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT || UPROBE_EVENT + depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS bool default y help -- cgit v1.2.3 From e41b104c7dba92443e594e6bc86e4b0bf1cdf573 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Fri, 6 Nov 2015 14:25:00 +0800 Subject: livepatch: x86: fix relocation computation with kASLR With kASLR enabled, old_addr provided by patch module is being shifted accrodingly so that the symbol lookups work. To have module relocations handled properly as well, the same transformation needs to be perfomed on relocation address information. [jkosina@suse.cz: extended / reworded changelog a bit] Reported-by: Cyril B. Signed-off-by: Zhou Chengming Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 6e5344112419..db545cbcdb89 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -294,6 +294,12 @@ static int klp_write_object_relocations(struct module *pmod, for (reloc = obj->relocs; reloc->name; reloc++) { if (!klp_is_module(obj)) { + +#if defined(CONFIG_RANDOMIZE_BASE) + /* If KASLR has been enabled, adjust old value accordingly */ + if (kaslr_enabled()) + reloc->val += kaslr_offset(); +#endif ret = klp_verify_vmlinux_symbol(reloc->name, reloc->val); if (ret) -- cgit v1.2.3 From 9d8a765211335cfdad464b90fb19f546af5706ae Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 20 Nov 2015 15:57:21 -0800 Subject: kernel/signal.c: unexport sigsuspend() sigsuspend() is nowhere used except in signal.c itself, so we can mark it static do not pollute the global namespace. But this patch is more than a boring cleanup patch, it fixes a real issue on UserModeLinux. UML has a special console driver to display ttys using xterm, or other terminal emulators, on the host side. Vegard reported that sometimes UML is unable to spawn a xterm and he's facing the following warning: WARNING: CPU: 0 PID: 908 at include/linux/thread_info.h:128 sigsuspend+0xab/0xc0() It turned out that this warning makes absolutely no sense as the UML xterm code calls sigsuspend() on the host side, at least it tries. But as the kernel itself offers a sigsuspend() symbol the linker choose this one instead of the glibc wrapper. Interestingly this code used to work since ever but always blocked signals on the wrong side. Some recent kernel change made the WARN_ON() trigger and uncovered the bug. It is a wonderful example of how much works by chance on computers. :-) Fixes: 68f3f16d9ad0f1 ("new helper: sigsuspend()") Signed-off-by: Richard Weinberger Reported-by: Vegard Nossum Tested-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c0b01fe24bbd..f3f1f7a972fd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3503,7 +3503,7 @@ SYSCALL_DEFINE0(pause) #endif -int sigsuspend(sigset_t *set) +static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); -- cgit v1.2.3 From 7625b3a0007decf2b135cb47ca67abc78a7b1bc1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 20 Nov 2015 15:57:24 -0800 Subject: kernel/panic.c: turn off locks debug before releasing console lock Commit 08d78658f393 ("panic: release stale console lock to always get the logbuf printed out") introduced an unwanted bad unlock balance report when panic() is called directly and not from OOPS (e.g. from out_of_memory()). The difference is that in case of OOPS we disable locks debug in oops_enter() and on direct panic call nobody does that. Fixes: 08d78658f393 ("panic: release stale console lock to always get the logbuf printed out") Reported-by: kernel test robot Signed-off-by: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Masami Hiramatsu Cc: Jiri Kosina Cc: Baoquan He Cc: Prarit Bhargava Cc: Xie XiuQi Cc: Seth Jennings Cc: "K. Y. Srinivasan" Cc: Jan Kara Cc: Petr Mladek Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 4579dbb7ed87..4b150bc0c6c1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -152,8 +152,11 @@ void panic(const char *fmt, ...) * We may have ended up stopping the CPU holding the lock (in * smp_send_stop()) while still having some valuable data in the console * buffer. Try to acquire the lock then release it regardless of the - * result. The release will also print the buffers out. + * result. The release will also print the buffers out. Locks debug + * should be disabled to avoid reporting bad unlock balance when + * panic() is not being callled from OOPS. */ + debug_locks_off(); console_trylock(); console_unlock(); -- cgit v1.2.3 From b81f472a208d3e2b4392faa6d17037a89442f4ce Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 23 Nov 2015 10:35:36 -0500 Subject: ring-buffer: Update read stamp with first real commit on page Do not update the read stamp after swapping out the reader page from the write buffer. If the reader page is swapped out of the buffer before an event is written to it, then the read_stamp may get an out of date timestamp, as the page timestamp is updated on the first commit to that page. rb_get_reader_page() only returns a page if it has an event on it, otherwise it will return NULL. At that point, check if the page being returned has events and has not been read yet. Then at that point update the read_stamp to match the time stamp of the reader page. Cc: stable@vger.kernel.org # 2.6.30+ Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 75f1d05ea82d..4dd6d5bc4e11 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,12 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; - cpu_buffer->reader_page->read = 0; -} - static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -3626,7 +3620,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; - rb_reset_reader_page(cpu_buffer); + cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; @@ -3636,6 +3630,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: + /* Update the read_stamp on the first event */ + if (reader && reader->read == 0) + cpu_buffer->read_stamp = reader->page->time_stamp; + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); -- cgit v1.2.3 From bd1b7cd360f529394936f28746eb4aaa12d6770a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 23 Nov 2015 17:35:24 -0500 Subject: ring-buffer: Put back the length if crossed page with add_timestamp Commit fcc742eaad7c "ring-buffer: Add event descriptor to simplify passing data" added a descriptor that holds various data instead of passing around several variables through parameters. The problem was that one of the parameters was modified in a function and the code was designed not to have an effect on that modified parameter. Now that the parameter is a descriptor and any modifications to it are non-volatile, the size of the data could be unnecessarily expanded. Remove the extra space added if a timestamp was added and the event went across the page. Cc: stable@vger.kernel.org # 4.3+ Fixes: fcc742eaad7c "ring-buffer: Add event descriptor to simplify passing data" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4dd6d5bc4e11..9c6045a27ba3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2797,8 +2797,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, event = __rb_reserve_next(cpu_buffer, &info); - if (unlikely(PTR_ERR(event) == -EAGAIN)) + if (unlikely(PTR_ERR(event) == -EAGAIN)) { + if (info.add_timestamp) + info.length -= RB_LEN_TIME_EXTEND; goto again; + } if (!event) goto out_fail; -- cgit v1.2.3 From 81b1a832d79749058863cffe2c0ed4ef40f6e6ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Nov 2015 11:39:54 -0800 Subject: pidns: fix NULL dereference in __task_pid_nr_ns() I got a crash during a "perf top" session that was caused by a race in __task_pid_nr_ns() : pid_nr_ns() was inlined, but apparently compiler chose to read task->pids[type].pid twice, and the pid->level dereference crashed because we got a NULL pointer at the second read : if (pid && ns->level <= pid->level) { // CRASH Just use RCU API properly to solve this race, and not worry about "perf top" crashing hosts :( get_task_pid() can benefit from same fix. Signed-off-by: Eric Dumazet Signed-off-by: Linus Torvalds --- kernel/pid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index ca368793808e..78b3d9f80d44 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -467,7 +467,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_lock(); if (type != PIDTYPE_PID) task = task->group_leader; - pid = get_pid(task->pids[type].pid); + pid = get_pid(rcu_dereference(task->pids[type].pid)); rcu_read_unlock(); return pid; } @@ -528,7 +528,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; - nr = pid_nr_ns(task->pids[type].pid, ns); + nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); -- cgit v1.2.3