From d86ab9cff8b936aadde444d0e263a8db5ff0349b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 3 May 2017 14:30:48 +0100 Subject: cpufreq: schedutil: use now as reference when aggregating shared policy requests Currently, sugov_next_freq_shared() uses last_freq_update_time as a reference to decide when to start considering CPU contributions as stale. However, since last_freq_update_time is set by the last CPU that issued a frequency transition, this might cause problems in certain cases. In practice, the detection of stale utilization values fails whenever the CPU with such values was the last to update the policy. For example (and please note again that the SCHED_CPUFREQ_RT flag is not the problem here, but only the detection of after how much time that flag has to be considered stale), suppose a policy with 2 CPUs: CPU0 | CPU1 | | RT task scheduled | SCHED_CPUFREQ_RT is set | CPU1->last_update = now | freq transition to max | last_freq_update_time = now | more than TICK_NSEC nsecs | a small CFS wakes up | CPU0->last_update = now1 | delta_ns(CPU0) < TICK_NSEC* | CPU0's util is considered | delta_ns(CPU1) = | last_freq_update_time - | CPU1->last_update = 0 | < TICK_NSEC | CPU1 is still considered | CPU1->SCHED_CPUFREQ_RT is set | we stay at max (until CPU1 | exits from idle) | * delta_ns is actually negative as now1 > last_freq_update_time While last_freq_update_time is a sensible reference for rate limiting, it doesn't seem to be useful for working around stale CPU states. Fix the problem by always considering now (time) as the reference for deciding when CPUs have stale contributions. Signed-off-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..622eed1b7658 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -245,11 +245,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -265,7 +264,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; continue; @@ -309,7 +308,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } -- cgit v1.2.3 From 0bae5fd3330be0517fba697e6b228601d421fade Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Thu, 11 May 2017 10:31:24 +0530 Subject: PM / hibernate: Declare variables as static Fixing sparse warnings: 'symbol not declared. Should it be static?' 
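These warnings come from the kernel's sparse checker; an illustrative way to reproduce them (exact warning text and line numbers may differ) is:

    $ make C=2 kernel/power/snapshot.o
    kernel/power/snapshot.c: warning: symbol 'alloc_normal' was not declared. Should it be static?

Variables that are only used within one translation unit should be given internal linkage, which is what the hunk below does.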
Signed-off-by: Pushkar Jambhlekar Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d79a38de425a..a628cccafa4a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1422,7 +1422,7 @@ static unsigned int nr_meta_pages; * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. */ -unsigned int alloc_normal, alloc_highmem; +static unsigned int alloc_normal, alloc_highmem; /* * Memory bitmap used for marking saveable pages (during hibernation) or * hibernation image pages (during restore) -- cgit v1.2.3 From 33c35aa4817864e056fd772230b0c6b552e36ea2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 May 2017 09:34:06 -0400 Subject: cgroup: Prevent kill_css() from being called more than once The kill_css() function may be called more than once under the condition that the css was killed but not physically removed yet followed by the removal of the cgroup that is hosting the css. This patch prevents any harmm from being done when that happens. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.5+ --- kernel/cgroup/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c3c9a0e1b3c9..8d4e85eae42c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4265,6 +4265,11 @@ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); + if (css->flags & CSS_DYING) + return; + + css->flags |= CSS_DYING; + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. -- cgit v1.2.3 From e4eda884db7930cee434828759064b4711604078 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 22 May 2017 12:27:07 -0400 Subject: net: Make IP alignment calulations clearer. The assignmnet: ip_align = strict ? 2 : NET_IP_ALIGN; in compare_pkt_ptr_alignment() trips up Coverity because we can only get to this code when strict is true, therefore ip_align will always be 2 regardless of NET_IP_ALIGN's value. So just assign directly to '2' and explain the situation in the comment above. Reported-by: "Gustavo A. R. Silva" Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1eddb713b815..c72cd41f5b8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -808,11 +808,15 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, reg_off += reg->aux_off; } - /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking - * we force this to 2 which is universally what architectures use - * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + /* For platforms that do not have a Kconfig enabling + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of + * NET_IP_ALIGN is universally set to '2'. And on platforms + * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get + * to this code only in strict mode where we want to emulate + * the NET_IP_ALIGN==2 checking. Therefore use an + * unconditional IP align value of '2'. */ - ip_align = strict ? 
2 : NET_IP_ALIGN; + ip_align = 2; if ((ip_align + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", ip_align, reg_off, off, size); -- cgit v1.2.3 From 04dc1b2fff4e96cb4142227fbdc63c8871ad4ed9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2017 17:48:50 +0200 Subject: futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock() Markus reported that the glibc/nptl/tst-robustpi8 test was failing after commit: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") The following trace shows the problem: ld-linux-x86-64-2161 [019] .... 410.760971: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_LOCK_PI ld-linux-x86-64-2161 [019] ...1 410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0 ld-linux-x86-64-2165 [011] .... 410.760978: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_UNLOCK_PI ld-linux-x86-64-2165 [011] d..1 410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0 ld-linux-x86-64-2165 [011] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000 ld-linux-x86-64-2161 [019] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161 which then returns with -ETIMEDOUT. That wrecks the lock state, because now the owner isn't aware it acquired the lock and removes the pending robust list entry. If 2161 is killed, the robust list will not clear out this futex and the subsequent acquire on this futex will then (correctly) result in -ESRCH which is unexpected by glibc, triggers an internal assertion and dies. Task 2161 Task 2165 rt_mutex_wait_proxy_lock() timeout(); /* T2161 is still queued in the waiter list */ return -ETIMEDOUT; futex_unlock_pi() spin_lock(hb->lock); rtmutex_unlock() remove_rtmutex_waiter(T2161); mark_lock_available(); /* Make the next waiter owner of the user space side */ futex_uval = 2161; spin_unlock(hb->lock); spin_lock(hb->lock); rt_mutex_cleanup_proxy_lock() if (rtmutex_owner() !== current) ... return FAIL; .... return -ETIMEOUT; This means that rt_mutex_cleanup_proxy_lock() needs to call try_to_take_rt_mutex() so it can take over the rtmutex correctly which was assigned by the waker. If the rtmutex is owned by some other task then this call is harmless and just confirmes that the waiter is not able to acquire it. While there, fix what looks like a merge error which resulted in rt_mutex_cleanup_proxy_lock() having two calls to fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any. Both should have one, since both potentially touch the waiter list. 
Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()") Reported-by: Markus Trippelsdorf Bug-Spotted-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Florian Weimer Cc: Darren Hart Cc: Sebastian Andrzej Siewior Cc: Markus Trippelsdorf Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b95509416909..28cd09e635ed 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1785,12 +1785,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, int ret; raw_spin_lock_irq(&lock->wait_lock); - - set_current_state(TASK_INTERRUPTIBLE); - /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1821,16 +1823,26 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, bool cleanup = false; raw_spin_lock_irq(&lock->wait_lock); + /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that. If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); /* * Unless we're the owner; we're still enqueued on the wait_list. * So check if we became owner, if not, take us off the wait_list. */ if (rt_mutex_owner(lock) != current) { remove_waiter(lock, waiter); - fixup_rt_mutex_waiters(lock); cleanup = true; } - /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. -- cgit v1.2.3 From 4d6501dce079c1eb6bf0b1d8f528a5e81770109e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 9 May 2017 09:39:59 +0200 Subject: kthread: Fix use-after-free if kthread fork fails If a kthread forks (e.g. usermodehelper since commit 1da5c46fa965) but fails in copy_process() between calling dup_task_struct() and setting p->set_child_tid, then the value of p->set_child_tid will be inherited from the parent and get prematurely freed by free_kthread_struct(). kthread() - worker_thread() - process_one_work() | - call_usermodehelper_exec_work() | - kernel_thread() | - _do_fork() | - copy_process() | - dup_task_struct() | - arch_dup_task_struct() | - tsk->set_child_tid = current->set_child_tid // implied | - ... | - goto bad_fork_* | - ... | - free_task(tsk) | - free_kthread_struct(tsk) | - kfree(tsk->set_child_tid) - ... - schedule() - __schedule() - wq_worker_sleeping() - kthread_data(task)->flags // UAF The problem started showing up with commit 1da5c46fa965 since it reused ->set_child_tid for the kthread worker data. A better long-term solution might be to get rid of the ->set_child_tid abuse. The comment in set_kthread_struct() also looks slightly wrong. 
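For readers unfamiliar with the reuse mentioned above: since commit 1da5c46fa965, struct kthread is reached through ->set_child_tid, roughly as follows (sketch from memory, simplified; not the literal kernel/kthread.c source):

    static inline struct kthread *to_kthread(struct task_struct *k)
    {
            WARN_ON(!(k->flags & PF_KTHREAD));
            return (__force void *)k->set_child_tid;  /* ->set_child_tid doubles as struct kthread * */
    }

    void *kthread_data(struct task_struct *task)
    {
            return to_kthread(task)->data;  /* a stale ->set_child_tid here is the use-after-free */
    }

This is why ->set_child_tid must be initialized before any bad_fork_* path can reach free_task().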
Debugged-by: Jamie Iles Fixes: 1da5c46fa965 ("kthread: Make struct kthread kmalloc'ed") Signed-off-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: Jamie Iles Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170509073959.17858-1-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner --- kernel/fork.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d681f8f10d2d..b7cdea10239c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1553,6 +1553,18 @@ static __latent_entropy struct task_struct *copy_process( if (!p) goto fork_out; + /* + * This _must_ happen before we call free_task(), i.e. before we jump + * to any of the bad_fork_* labels. This is to avoid freeing + * p->set_child_tid which is (ab)used as a kthread's data pointer for + * kernel threads (PF_KTHREAD). + */ + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + ftrace_graph_init_task(p); rt_mutex_init_task(p); @@ -1716,11 +1728,6 @@ static __latent_entropy struct task_struct *copy_process( } } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif -- cgit v1.2.3 From c70d9d809fdeecedb96972457ee45c49a232d97f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 May 2017 15:40:12 -0500 Subject: ptrace: Properly initialize ptracer_cred on fork When I introduced ptracer_cred I failed to consider the weirdness of fork where the task_struct copies the old value by default. This winds up leaving ptracer_cred set even when a process forks and the child process does not wind up being ptraced. Because ptracer_cred is not set on non-ptraced processes whose parents were ptraced this has broken the ability of the enlightenment window manager to start setuid children. Fix this by properly initializing ptracer_cred in ptrace_init_task This must be done with a little bit of care to preserve the current value of ptracer_cred when ptrace carries through fork. Re-reading the ptracer_cred from the ptracing process at this point is inconsistent with how PT_PTRACE_CAP has been maintained all of these years. Tested-by: Takashi Iwai Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. 
*/ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 43fe8b8eb81eee713400340716cf945f59d21496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2017 23:27:38 +0200 Subject: posix-timers: Make signal printks conditional A recent commit added extra printks for CPU/RT limits. This can result in excessive spam in dmesg. Make the printks conditional on print_fatal_signals. Fixes: e7ea7c9806a2 ("rlimits: Print more information when CPU/RT limits are exceeded") Reported-by: Dave Jones Signed-off-by: Thomas Gleixner Cc: Arun Raghavan --- kernel/time/posix-cpu-timers.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..d2a1e6dd0291 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -825,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -838,8 +840,10 @@ static void check_thread_timers(struct task_struct *tsk, soft += USEC_PER_SEC; sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } - pr_info("RT Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } @@ -936,8 +940,10 @@ static void check_process_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("RT Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -945,8 +951,10 @@ static void check_process_timers(struct task_struct *tsk, /* * At the soft limit, send a SIGXCPU every second. 
*/ - pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { soft++; -- cgit v1.2.3 From 41c25707d21716826e3c1f60967f5550610ec1c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 May 2017 12:03:48 -0400 Subject: cpuset: consider dying css as offline In most cases, a cgroup controller don't care about the liftimes of cgroups. For the controller, a css becomes online when ->css_online() is called on it and offline when ->css_offline() is called. However, cpuset is special in that the user interface it exposes cares whether certain cgroups exist or not. Combined with the RCU delay between cgroup removal and css offlining, this can lead to user visible behavior oddities where operations which should succeed after cgroup removals fail for some time period. The effects of cgroup removals are delayed when seen from userland. This patch adds css_is_dying() which tests whether offline is pending and updates is_cpuset_online() so that the function returns false also while offline is pending. This gets rid of the userland visible delays. Signed-off-by: Tejun Heo Reported-by: Daniel Jordan Link: http://lkml.kernel.org/r/327ca1f5-7957-fbb9-9e5f-9ba149d40ba2@oracle.com Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f6501f4f6040..ae643412948a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -176,9 +176,9 @@ typedef enum { } cpuset_flagbits_t; /* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) +static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags); + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) -- cgit v1.2.3 From 1ad2f5838d345e1c102bd1cd27c4f4c1349b0dc8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:05 +0200 Subject: bpf: fix incorrect pruning decision when alignment must be tracked Currently, when we enforce alignment tracking on direct packet access, the verifier lets the following program pass despite doing a packet write with unaligned access: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (bf) r0 = r2 4: (07) r0 += 14 5: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 6: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 7: (63) *(u32 *)(r0 -4) = r0 8: (b7) r0 = 0 9: (95) exit from 6 to 8: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 8: (b7) r0 = 0 9: (95) exit from 5 to 10: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R10=fp 10: (07) r0 += 1 11: (05) goto pc-6 6: safe <----- here, wrongly found safe processed 15 insns However, if we enforce a pruning mismatch by adding state into r8 which is then being mismatched in states_equal(), we find that for the otherwise same program, the verifier detects a misaligned packet access when actually walking that path: 0: (61) r2 = *(u32 *)(r1 +76) 1: 
(61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (b7) r8 = 1 4: (bf) r0 = r2 5: (07) r0 += 14 6: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 9: (b7) r0 = 0 10: (95) exit from 7 to 9: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 9: (b7) r0 = 0 10: (95) exit from 6 to 11: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 11: (07) r0 += 1 12: (b7) r8 = 0 13: (05) goto pc-7 <----- mismatch due to r8 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=15,r=15) R1=ctx R2=pkt(id=0,off=0,r=15) R3=pkt_end R7=inv,min_value=2 R8=imm0,min_value=0,max_value=0,min_align=2147483648 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 misaligned packet access off 2+15+-4 size 4 The reason why we fail to see it in states_equal() is that the third test in compare_ptrs_to_packet() ... if (old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; ... will let the above pass. The situation we run into is that old->off <= cur->off (14 <= 15), meaning that prior walked paths went with smaller offset, which was later used in the packet access after successful packet range check and found to be safe already. For example: Given is R0=pkt(id=0,off=0,r=0). Adding offset 14 as in above program to it, results in R0=pkt(id=0,off=14,r=0) before the packet range test. Now, testing this against R3=pkt_end with 'if r0 > r3 goto out' will transform R0 into R0=pkt(id=0,off=14,r=14) for the case when we're within bounds. A write into the packet at offset *(u32 *)(r0 -4), that is, 2 + 14 -4, is valid and aligned (2 is for NET_IP_ALIGN). After processing this with all fall-through paths, we later on check paths from branches. When the above skb->mark test is true, then we jump near the end of the program, perform r0 += 1, and jump back to the 'if r0 > r3 goto out' test we've visited earlier already. This time, R0 is of type R0=pkt(id=0,off=15,r=0), and we'll prune that part because this time we'll have a larger safe packet range, and we already found that with off=14 all further insn were already safe, so it's safe as well with a larger off. However, the problem is that the subsequent write into the packet with 2 + 15 -4 is then unaligned, and not caught by the alignment tracking. Note that min_align, aux_off, and aux_off_align were all 0 in this example. Since we cannot tell at this time what kind of packet access was performed in the prior walk and what minimal requirements it has (we might do so in the future, but that requires more complexity), fix it to disable this pruning case for strict alignment for now, and let the verifier do check such paths instead. With that applied, the test cases pass and reject the program due to misalignment. Fixes: d1174416747d ("bpf: Track alignment of register values in the verifier.") Reference: http://patchwork.ozlabs.org/patch/761909/ Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c72cd41f5b8b..e37e06b1229d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -843,9 +843,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, { bool strict = env->strict_alignment; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - strict = true; - switch (reg->type) { case PTR_TO_PACKET: return check_pkt_ptr_alignment(reg, off, size, strict); @@ -2696,7 +2693,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, + struct bpf_reg_state *old, struct bpf_reg_state *cur) { if (old->id != cur->id) @@ -2739,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * 'if (R4 > data_end)' and all further insn were already good with r=20, * so they will be good with r=30 and we can prune the search. */ - if (old->off <= cur->off && + if (!env->strict_alignment && old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; @@ -2810,7 +2808,7 @@ static bool states_equal(struct bpf_verifier_env *env, continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(rold, rcur)) + compare_ptrs_to_packet(env, rold, rcur)) continue; return false; @@ -3588,10 +3586,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) } else { log_level = 0; } - if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) + + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - else - env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) @@ -3697,7 +3695,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, mutex_lock(&bpf_verifier_lock); log_level = 0; + env->strict_alignment = false; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), -- cgit v1.2.3 From a9789ef9afcb4fb0193f8dd94f2665ba3ad71e79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:06 +0200 Subject: bpf: properly reset caller saved regs after helper call and ld_abs/ind Currently, after performing helper calls, we clear all caller saved registers, that is r0 - r5 and fill r0 depending on struct bpf_func_proto specification. The way we reset these regs can affect pruning decisions in later paths, since we only reset register's imm to 0 and type to NOT_INIT. However, we leave out clearing of other variables such as id, min_value, max_value, etc, which can later on lead to pruning mismatches due to stale data. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e37e06b1229d..339c8a1371de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + + memset(®s[regno], 0, sizeof(regs[regno])); + regs[regno].type = NOT_INIT; + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) { - regs[i].type = NOT_INIT; - regs[i].imm = 0; - regs[i].min_value = BPF_REGISTER_MIN_RANGE; - regs[i].max_value = BPF_REGISTER_MAX_RANGE; - regs[i].min_align = 0; - regs[i].aux_off = 0; - regs[i].aux_off_align = 0; - } + for (i = 0; i < MAX_BPF_REG; i++) + mark_reg_not_init(regs, i); /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; @@ -1346,7 +1349,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs = state->regs; - struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1413,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* update return register */ if (fn->ret_type == RET_INTEGER) { @@ -2445,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -2478,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* mark destination R0 register as readable, since it contains * the value fetched from the packet -- cgit v1.2.3 From a316338cb71a3260201490e615f2f6d5c0d8fb2c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:08 +0200 Subject: bpf: fix wrong exposure of map_flags into fdinfo for lpm trie_alloc() always needs to have BPF_F_NO_PREALLOC passed in via attr->map_flags, since it does not support preallocation yet. We check the flag, but we never copy the flag into trie->map.map_flags, which is later on exposed into fdinfo and used by loaders such as iproute2. Latter uses this in bpf_map_selfcheck_pinned() to test whether a pinned map has the same spec as the one from the BPF obj file and if not, bails out, which is currently the case for lpm since it exposes always 0 as flags. Also copy over flags in array_map_alloc() and stack_map_alloc(). They always have to be 0 right now, but we should make sure to not miss to copy them over at a later point in time when we add actual flags for them to use. 
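As an illustration of where these flags come from, a map that needs BPF_F_NO_PREALLOC is created against the bpf(2) syscall roughly like this (sketch only; sizes are example values, error handling omitted):

    #include <linux/bpf.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    static int create_lpm_map(void)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.map_type    = BPF_MAP_TYPE_LPM_TRIE;
            attr.key_size    = 8;                  /* struct bpf_lpm_trie_key + 4 bytes of data */
            attr.value_size  = 8;
            attr.max_entries = 1;
            attr.map_flags   = BPF_F_NO_PREALLOC;  /* mandatory for the trie */

            return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }

Without this patch, fdinfo for such a map reported map_flags as 0 instead of BPF_F_NO_PREALLOC, so a loader comparing a pinned map against its object file spec would wrongly reject it.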
Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reported-by: Jarno Rajahalme Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 1 + kernel/bpf/lpm_trie.c | 1 + kernel/bpf/stackmap.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..172dc8ee0e3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.key_size = attr->key_size; trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.key_size = attr->key_size; smap->map.value_size = value_size; smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; -- cgit v1.2.3 From 5720acf4bfc142ba568d5b6782fceaf62ed15e0b Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Fri, 26 May 2017 14:45:21 +0200 Subject: livepatch: Make livepatch dependent on !TRIM_UNUSED_KSYMS If TRIM_UNUSED_KSYMS is enabled, all unneeded exported symbols are made unexported. Two-pass build of the kernel is done to find out which symbols are needed based on a configuration. This effectively complicates things for out-of-tree modules. Livepatch exports functions to (un)register and enable/disable a live patch. The only in-tree module which uses these functions is a sample in samples/livepatch/. If the sample is disabled, the functions are trimmed and out-of-tree live patches cannot be built. Note that live patches are intended to be built out-of-tree. Suggested-by: Michal Marek Acked-by: Josh Poimboeuf Acked-by: Jessica Yu Signed-off-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 045022557936..ec4565122e65 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -10,6 +10,7 @@ config LIVEPATCH depends on SYSFS depends on KALLSYMS_ALL depends on HAVE_LIVEPATCH + depends on !TRIM_UNUSED_KSYMS help Say Y here if you want to support kernel live patching. This option has no runtime impact until a kernel "patch" -- cgit v1.2.3 From f9797c2f20c0160edd718aa467101f3301e57e59 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 25 May 2017 16:20:38 +0100 Subject: ftrace: Fix memory leak in ftrace_graph_release() ftrace_hash is being kfree'ed in ftrace_graph_release(), however the ->buckets field is not. 
This results in a memory leak that is easily captured by kmemleak: unreferenced object 0xffff880038afe000 (size 8192): comm "trace-cmd", pid 238, jiffies 4294916898 (age 9.736s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc+0x12d/0x1a0 [] alloc_ftrace_hash+0x51/0x80 [] __ftrace_graph_open.isra.39.constprop.46+0xa3/0x100 [] ftrace_graph_open+0x68/0xa0 [] do_dentry_open.isra.1+0x1bd/0x2d0 [] vfs_open+0x47/0x60 [] path_openat+0x2a5/0x1020 [] do_filp_open+0x8a/0xf0 [] do_sys_open+0x12f/0x200 [] SyS_open+0x1e/0x20 [] entry_SYSCALL_64_fastpath+0x13/0x94 [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/20170525152038.7661-1-lhenriques@suse.com Cc: stable@vger.kernel.org Fixes: b9b0c831bed2 ("ftrace: Convert graph filter to use hash tables") Signed-off-by: Luis Henriques Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 74fdfe9ed3db..9e5841dc14b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5063,7 +5063,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) } out: - kfree(fgd->new_hash); + free_ftrace_hash(fgd->new_hash); kfree(fgd); return ret; -- cgit v1.2.3 From c93f5cf571e7795f97d49ef51b766cf25e328545 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 May 2017 19:38:17 +0900 Subject: kprobes/x86: Fix to set RWX bits correctly before releasing trampoline Fix kprobes to set(recover) RWX bits correctly on trampoline buffer before releasing it. Releasing readonly page to module_memfree() crash the kernel. Without this fix, if kprobes user register a bunch of kprobes in function body (since kprobes on function entry usually use ftrace) and unregister it, kernel hits a BUG and crash. Link: http://lkml.kernel.org/r/149570868652.3518.14120169373590420503.stgit@devbox Signed-off-by: Masami Hiramatsu Fixes: d0381c81c2f7 ("kprobes/x86: Set kprobes pages read-only") Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d2d3a568e4e..adfe3b4cfe05 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -122,7 +122,7 @@ static void *alloc_insn_page(void) return module_alloc(PAGE_SIZE); } -static void free_insn_page(void *page) +void __weak free_insn_page(void *page) { module_memfree(page); } -- cgit v1.2.3 From 40da1b11f01e43aad1aa6cea64681b6125e8a2a7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 2 Jun 2017 16:27:14 +0200 Subject: cpu/hotplug: Drop the device lock on error If a custom CPU target is specified and that one is not available _or_ can't be interrupted then the code returns to userland without dropping a lock as notices by lockdep: |echo 133 > /sys/devices/system/cpu/cpu7/hotplug/target | ================================================ | [ BUG: lock held when returning to user space! ] | ------------------------------------------------ | bash/503 is leaving the kernel with locks still held! | 1 lock held by bash/503: | #0: (device_hotplug_lock){+.+...}, at: [] lock_device_hotplug_sysfs+0x10/0x40 So release the lock then. 
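For context, the sysfs interface involved works roughly like this (paths taken from the trace above; the state number is just an example):

    # cat /sys/devices/system/cpu/hotplug/states          # valid target states
    # cat /sys/devices/system/cpu/cpu7/hotplug/state      # current state of CPU 7
    # echo 133 > /sys/devices/system/cpu/cpu7/hotplug/target

Writing a target that is unknown or marked cant_stop makes write_cpuhp_target() bail out with -EINVAL, and before this fix that early return skipped unlock_device_hotplug().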
Fixes: 757c989b9994 ("cpu/hotplug: Make target state writeable") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170602142714.3ogo25f2wbq6fjpj@linutronix.de --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..cb5103413bd8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1658,13 +1658,13 @@ static ssize_t write_cpuhp_target(struct device *dev, ret = !sp->name || sp->cant_stop ? -EINVAL : 0; mutex_unlock(&cpuhp_state_mutex); if (ret) - return ret; + goto out; if (st->state < target) ret = do_cpu_up(dev->id, target); else ret = do_cpu_down(dev->id, target); - +out: unlock_device_hotplug(); return ret ? ret : count; } -- cgit v1.2.3 From f4781e76f90df7aec400635d73ea4c35ee1d4765 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:34 +0200 Subject: alarmtimer: Prevent overflow of relative timers Andrey reported a alartimer related RCU stall while fuzzing the kernel with syzkaller. The reason for this is an overflow in ktime_add() which brings the resulting time into negative space and causes immediate expiry of the timer. The following rearm with a small interval does not bring the timer back into positive space due to the same issue. This results in a permanent firing alarmtimer which hogs the CPU. Use ktime_add_safe() instead which detects the overflow and clamps the result to KTIME_SEC_MAX. Reported-by: Andrey Konovalov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Kostya Serebryany Cc: syzkaller Cc: John Stultz Cc: Dmitry Vyukov Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170530211655.802921648@linutronix.de --- kernel/time/alarmtimer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..2b2e032b4eb4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -387,7 +387,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add(start, base->gettime()); + start = ktime_add_safe(start, base->gettime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -475,7 +475,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) overrun++; } - alarm->node.expires = ktime_add(alarm->node.expires, interval); + alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); return overrun; } EXPORT_SYMBOL_GPL(alarm_forward); @@ -666,7 +666,7 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, ktime_t now; now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add(now, exp); + exp = ktime_add_safe(now, exp); } alarm_start(&timr->it.alarm.alarmtimer, exp); -- cgit v1.2.3 From ff86bf0c65f14346bf2440534f9ba5ac232c39a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:35 +0200 Subject: alarmtimer: Rate limit periodic intervals The alarmtimer code has another source of potentially rearming itself too fast. Interval timers with a very samll interval have a similar CPU hog effect as the previously fixed overflow issue. 
The reason is that alarmtimers do not implement the normal protection against this kind of problem which the other posix timer use: timer expires -> queue signal -> deliver signal -> rearm timer This scheme brings the rearming under scheduler control and prevents permanently firing timers which hog the CPU. Bringing this scheme to the alarm timer code is a major overhaul because it lacks all the necessary mechanisms completely. So for a quick fix limit the interval to one jiffie. This is not problematic in practice as alarmtimers are usually backed by an RTC for suspend which have 1 second resolution. It could be therefor argued that the resolution of this clock should be set to 1 second in general, but that's outside the scope of this fix. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Kostya Serebryany Cc: syzkaller Cc: John Stultz Cc: Dmitry Vyukov Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170530211655.896767100@linutronix.de --- kernel/time/alarmtimer.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2b2e032b4eb4..ee2f4202d82a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -660,6 +660,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, /* start the timer */ timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + + /* + * Rate limit to the tick as a hot fix to prevent DOS. Will be + * mopped up later. + */ + if (timr->it.alarm.interval < TICK_NSEC) + timr->it.alarm.interval = TICK_NSEC; + exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { -- cgit v1.2.3 From f3b7eaae1b35eb8077610eb7c7db042c9b0645e1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 7 Jun 2017 00:57:37 +0200 Subject: Revert "ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle" Revert commit eed4d47efe95 (ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle) as it turned out to be premature and triggered a number of different issues on various systems. That includes, but is not limited to, premature suspend-to-RAM aborts on Dell XPS 13 (9343) reported by Dominik. The issue the commit in question attempted to address is real and will need to be taken care of going forward, but evidently more work is needed for this purpose. Reported-by: Dominik Brodowski Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 2 +- kernel/power/suspend.c | 29 ++++------------------------- 2 files changed, 5 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 78672d324a6e..c7209f060eeb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(true); + pm_wakeup_clear(); pr_info("Freezing user space processes ... 
"); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c0248c74d6d4..15e6baef5c73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,8 +72,6 @@ static void freeze_begin(void) static void freeze_enter(void) { - trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); - spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -100,27 +98,6 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); - - trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); -} - -static void s2idle_loop(void) -{ - do { - freeze_enter(); - - if (freeze_ops && freeze_ops->wake) - freeze_ops->wake(); - - dpm_resume_noirq(PMSG_RESUME); - if (freeze_ops && freeze_ops->sync) - freeze_ops->sync(); - - if (pm_wakeup_pending()) - break; - - pm_wakeup_clear(false); - } while (!dpm_suspend_noirq(PMSG_SUSPEND)); } void freeze_wake(void) @@ -394,8 +371,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { - s2idle_loop(); - goto Platform_early_resume; + trace_suspend_resume(TPS("machine_suspend"), state, true); + freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); + goto Platform_wake; } error = disable_nonboot_cpus(); -- cgit v1.2.3 From cc1582c231ea041fbc68861dfaf957eaf902b829 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Thu, 25 May 2017 18:09:07 +0800 Subject: perf/core: Drop kernel samples even though :u is specified When doing sampling, for example: perf record -e cycles:u ... On workloads that do a lot of kernel entry/exits we see kernel samples, even though :u is specified. This is due to skid existing. This might be a security issue because it can leak kernel addresses even though kernel sampling support is disabled. The patch drops the kernel samples if exclude_kernel is specified. For example, test on Haswell desktop: perf record -e cycles:u perf report --stdio Before patch applied: 99.77% mgen mgen [.] buf_read 0.20% mgen mgen [.] rand_buf_init 0.01% mgen [kernel.vmlinux] [k] apic_timer_interrupt 0.00% mgen mgen [.] last_free_elem 0.00% mgen libc-2.23.so [.] __random_r 0.00% mgen libc-2.23.so [.] _int_malloc 0.00% mgen mgen [.] rand_array_init 0.00% mgen [kernel.vmlinux] [k] page_fault 0.00% mgen libc-2.23.so [.] __random 0.00% mgen libc-2.23.so [.] __strcasestr 0.00% mgen ld-2.23.so [.] strcmp 0.00% mgen ld-2.23.so [.] _dl_start 0.00% mgen libc-2.23.so [.] sched_setaffinity@@GLIBC_2.3.4 0.00% mgen ld-2.23.so [.] _start We can see kernel symbols apic_timer_interrupt and page_fault. After patch applied: 99.79% mgen mgen [.] buf_read 0.19% mgen mgen [.] rand_buf_init 0.00% mgen libc-2.23.so [.] __random_r 0.00% mgen mgen [.] rand_array_init 0.00% mgen mgen [.] last_free_elem 0.00% mgen libc-2.23.so [.] vfprintf 0.00% mgen libc-2.23.so [.] rand 0.00% mgen libc-2.23.so [.] __random 0.00% mgen libc-2.23.so [.] _int_malloc 0.00% mgen libc-2.23.so [.] _IO_doallocbuf 0.00% mgen ld-2.23.so [.] do_lookup_x 0.00% mgen ld-2.23.so [.] open_verify.constprop.7 0.00% mgen ld-2.23.so [.] _dl_important_hwcaps 0.00% mgen libc-2.23.so [.] sched_setaffinity@@GLIBC_2.3.4 0.00% mgen ld-2.23.so [.] _start There are only userspace symbols. 
Signed-off-by: Jin Yao Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Cc: kan.liang@intel.com Cc: mark.rutland@arm.com Cc: will.deacon@arm.com Cc: yao.jin@intel.com Link: http://lkml.kernel.org/r/1495706947-3744-1-git-send-email-yao.jin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..6c4e523dc1e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7316,6 +7316,21 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + * To avoid leaking information to userspace, we must always + * reject kernel samples when exclude_kernel is set. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. */ @@ -7336,6 +7351,12 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + /* + * For security, drop the skid kernel samples if necessary. + */ + if (!sample_is_allowed(event, regs)) + return ret; + /* * XXX event_limit might not quite work as expected on inherited * events -- cgit v1.2.3 From dac8bbbae1d0ccba96402d25deeed3a2e87992c6 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 8 Jun 2017 12:01:30 +0200 Subject: Revert "printk: fix double printing with earlycon" This reverts commit cf39bf58afdaabc0b86f141630fb3fd18190294e. The commit regression to users that define both console=ttyS1 and console=ttyS0 on the command line, see https://lkml.kernel.org/r/20170509082915.GA13236@bistromath.localdomain The kernel log messages always appeared only on one serial port. It is even documented in Documentation/admin-guide/serial-console.rst: "Note that you can only define one console per device type (serial, video)." The above mentioned commit changed the order in which the command line parameters are searched. As a result, the kernel log messages go to the last mentioned ttyS* instead of the first one. We long thought that using two console=ttyS* on the command line did not make sense. But then we realized that console= parameters were handled also by systemd, see http://0pointer.de/blog/projects/serial-console.html "By default systemd will instantiate one serial-getty@.service on the main kernel console, if it is not a virtual terminal." where "[4] If multiple kernel consoles are used simultaneously, the main console is the one listed first in /sys/class/tty/console/active, which is the last one listed on the kernel command line." This puts the original report into another light. The system is running in qemu. The first serial port is used to store the messages into a file. The second one is used to login to the system via a socket. It depends on systemd and the historic kernel behavior. By other words, systemd causes that it makes sense to define both console=ttyS1 console=ttyS0 on the command line. 
The kernel fix caused regression related to userspace (systemd) and need to be reverted. In addition, it went out that the fix helped only partially. The messages still were duplicated when the boot console was removed early by late_initcall(printk_late_init). Then the entire log was replayed when the same console was registered as a normal one. Link: 20170606160339.GC7604@pathway.suse.cz Cc: Aleksey Makarov Cc: Sabrina Dubroca Cc: Sudeep Holla Cc: Greg Kroah-Hartman Cc: Peter Hurley Cc: Jiri Slaby Cc: Robin Murphy , Cc: Steven Rostedt Cc: "Nair, Jayachandran" Cc: linux-serial@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reported-by: Sabrina Dubroca Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 46 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 779479ac9f57..9dbceb76e6bc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,7 +269,6 @@ static struct console *exclusive_console; #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; -static int console_cmdline_cnt; static int preferred_console = -1; int console_set_on_cmdline; @@ -1906,25 +1905,12 @@ static int __add_preferred_console(char *name, int idx, char *options, * See if this tty is not yet registered, and * if we have a slot free. */ - for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { if (strcmp(c->name, name) == 0 && c->index == idx) { - if (brl_options) - return 0; - - /* - * Maintain an invariant that will help to find if - * the matching console is preferred, see - * register_console(): - * - * The last non-braille console is always - * the preferred one. - */ - if (i != console_cmdline_cnt - 1) - swap(console_cmdline[i], - console_cmdline[console_cmdline_cnt - 1]); - - preferred_console = console_cmdline_cnt - 1; - + if (!brl_options) + preferred_console = i; return 0; } } @@ -1937,7 +1923,6 @@ static int __add_preferred_console(char *name, int idx, char *options, braille_set_options(c, brl_options); c->index = idx; - console_cmdline_cnt++; return 0; } /* @@ -2477,23 +2462,12 @@ void register_console(struct console *newcon) } /* - * See if this console matches one we selected on the command line. - * - * There may be several entries in the console_cmdline array matching - * with the same console, one with newcon->match(), another by - * name/index: - * - * pl011,mmio,0x87e024000000,115200 -- added from SPCR - * ttyAMA0 -- added from command line - * - * Traverse the console_cmdline array in reverse order to be - * sure that if this console is preferred then it will be the first - * matching entry. We use the invariant that is maintained in - * __add_preferred_console(). + * See if this console matches one we selected on + * the command line. 
*/ - for (i = console_cmdline_cnt - 1; i >= 0; i--) { - c = console_cmdline + i; - + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ -- cgit v1.2.3 From cdf7abc4610a7f1c43d06cda246c5f748a4fd267 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:03:10 +0200 Subject: srcu: Allow use of Tiny/Tree SRCU from both process and interrupt context Linu Cherian reported a WARN in cleanup_srcu_struct() when shutting down a guest running iperf on a VFIO assigned device. This happens because irqfd_wakeup() calls srcu_read_lock(&kvm->irq_srcu) in interrupt context, while a worker thread does the same inside kvm_set_irq(). If the interrupt happens while the worker thread is executing __srcu_read_lock(), updates to the Classic SRCU ->lock_count[] field or the Tree SRCU ->srcu_lock_count[] field can be lost. The docs say you are not supposed to call srcu_read_lock() and srcu_read_unlock() from irq context, but KVM interrupt injection happens from (host) interrupt context and it would be nice if SRCU supported the use case. KVM is using SRCU here not really for the "sleepable" part, but rather due to its IPI-free fast detection of grace periods. It is therefore not desirable to switch back to RCU, which would effectively revert commit 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING", 2014-01-16). However, the docs are overly conservative. You can have an SRCU instance only has users in irq context, and you can mix process and irq context as long as process context users disable interrupts. In addition, __srcu_read_unlock() actually uses this_cpu_dec() on both Tree SRCU and Classic SRCU. For those two implementations, only srcu_read_lock() is unsafe. When Classic SRCU's __srcu_read_unlock() was changed to use this_cpu_dec(), in commit 5a41344a3d83 ("srcu: Simplify __srcu_read_unlock() via this_cpu_dec()", 2012-11-29), __srcu_read_lock() did two increments. Therefore it kept __this_cpu_inc(), with preempt_disable/enable in the caller. Tree SRCU however only does one increment, so on most architectures it is more efficient for __srcu_read_lock() to use this_cpu_inc(), and any performance differences appear to be down in the noise. Unlike Classic and Tree SRCU, Tiny SRCU does increments and decrements on a single variable. Therefore, as Peter Zijlstra pointed out, Tiny SRCU's implementation already supports mixed-context use of srcu_read_lock() and srcu_read_unlock(), at least as long as uses of srcu_read_lock() and srcu_read_unlock() in each handler are nested and paired properly. In other words, it is still illegal to (say) invoke srcu_read_lock() in an interrupt handler and to invoke the matching srcu_read_unlock() in a softirq handler. Therefore, the only change required for Tiny SRCU is to its comments. Fixes: 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING") Reported-by: Linu Cherian Suggested-by: Linu Cherian Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini Cc: Linus Torvalds Signed-off-by: Paul E. 
McKenney Tested-by: Paolo Bonzini --- kernel/rcu/srcutiny.c | 7 ++++--- kernel/rcu/srcutree.c | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 36e1f82faed1..32798eb14853 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -97,8 +97,9 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. - * Returns an index that must be passed to the matching srcu_read_unlock(). + * srcu_struct. Can be invoked from irq/bh handlers, but the matching + * __srcu_read_unlock() must be in the same handler instance. Returns an + * index that must be passed to the matching srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *sp) { @@ -112,7 +113,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); /* * Removes the count for the old reader from the appropriate element of - * the srcu_struct. Must be called from process context. + * the srcu_struct. */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3ae8474557df..157654fa436a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -357,7 +357,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. + * srcu_struct. * Returns an index that must be passed to the matching srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *sp) @@ -365,7 +365,7 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->srcu_idx) & 0x1; - __this_cpu_inc(sp->sda->srcu_lock_count[idx]); + this_cpu_inc(sp->sda->srcu_lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -375,7 +375,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); * Removes the count for the old reader from the appropriate per-CPU * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { -- cgit v1.2.3 From 1123a6041654e8f889014659593bad4168e542c2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:03:11 +0200 Subject: srcu: Allow use of Classic SRCU from both process and interrupt context Linu Cherian reported a WARN in cleanup_srcu_struct() when shutting down a guest running iperf on a VFIO assigned device. This happens because irqfd_wakeup() calls srcu_read_lock(&kvm->irq_srcu) in interrupt context, while a worker thread does the same inside kvm_set_irq(). If the interrupt happens while the worker thread is executing __srcu_read_lock(), updates to the Classic SRCU ->lock_count[] field or the Tree SRCU ->srcu_lock_count[] field can be lost. The docs say you are not supposed to call srcu_read_lock() and srcu_read_unlock() from irq context, but KVM interrupt injection happens from (host) interrupt context and it would be nice if SRCU supported the use case. KVM is using SRCU here not really for the "sleepable" part, but rather due to its IPI-free fast detection of grace periods. It is therefore not desirable to switch back to RCU, which would effectively revert commit 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING", 2014-01-16). However, the docs are overly conservative. 
You can have an SRCU instance that only has users in irq context, and you can mix process and irq context as long as process context users disable interrupts. In addition, __srcu_read_unlock() actually uses this_cpu_dec() on both Tree SRCU and Classic SRCU. For those two implementations, only srcu_read_lock() is unsafe. When Classic SRCU's __srcu_read_unlock() was changed to use this_cpu_dec(), in commit 5a41344a3d83 ("srcu: Simplify __srcu_read_unlock() via this_cpu_dec()", 2012-11-29), __srcu_read_lock() did two increments. Therefore it kept __this_cpu_inc(), with preempt_disable/enable in the caller. Tree SRCU however only does one increment, so on most architectures it is more efficient for __srcu_read_lock() to use this_cpu_inc(), and any performance differences appear to be down in the noise. Cc: stable@vger.kernel.org Fixes: 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING") Reported-by: Linu Cherian Suggested-by: Linu Cherian Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini Cc: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 584d8a983883..dea03614263f 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -263,7 +263,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. + * srcu_struct. * Returns an index that must be passed to the matching srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *sp) @@ -271,7 +271,7 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); + this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -281,7 +281,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); * Removes the count for the old reader from the appropriate per-CPU * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { -- cgit v1.2.3 From f67abed585efe251edda52dc9690020d6441890f Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 9 Jun 2017 10:00:29 +0200 Subject: sched/fair: Fix typo in printk message The 'schedstats' kernel parameter should be set to enable/disable, so correct the printk hint saying that it should be set to 'enable' rather than 'enabled' to enable scheduler tracepoints.
Signed-off-by: Marcin Nowakowski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1496995229-31245-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..c77e4b1d51c0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3563,7 +3563,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_runtime_enabled()) { printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enabled or " + "kernel parameter schedstats=enable or " "kernel.sched_schedstats=1\n"); } #endif -- cgit v1.2.3 From 252d2a4117bc181b287eeddf848863788da733ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 9 Jun 2017 11:49:15 -0700 Subject: sched/core: Idle_task_exit() shouldn't use switch_mm_irqs_off() On x86, idle_task_exit() can be called with IRQs on, and it should therefore use switch_mm(), not switch_mm_irqs_off(). This doesn't seem to cause any problems right now, but it will confuse my upcoming TLB flush changes. Nonetheless, I think it should be backported because it's trivial. There won't be any meaningful performance impact because idle_task_exit() is only used when offlining a CPU. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: f98db6013c55 ("sched/core: Add switch_mm_irqs_off() and use it in the scheduler") Link: http://lkml.kernel.org/r/ca3d1a9fa93a0b49f5a8ff729eda3640fb6abdf9.1497034141.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..326d4f88e2b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5605,7 +5605,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm_irqs_off(mm, &init_mm, current); + switch_mm(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); -- cgit v1.2.3 From ff0a6d6f932ff4b9eb8e8140f98cc1cf763d0d78 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Jun 2017 14:16:16 +0200 Subject: Revert "cpufreq: schedutil: Reduce frequencies slower" Revert commit 39b64aa1c007 (cpufreq: schedutil: Reduce frequencies slower) that introduced unintentional changes in behavior leading to adverse effects on some systems. Reported-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- kernel/sched/cpufreq_schedutil.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..8773d1efdfab 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -101,9 +101,6 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sg_policy->next_freq == next_freq) return; - if (sg_policy->next_freq > next_freq) - next_freq = (sg_policy->next_freq + next_freq) >> 1; - sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; -- cgit v1.2.3 From 94114c367553f3301747e47f6947cabde947575f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 7 Jun 2017 23:36:03 -0700 Subject: tick/broadcast: Make tick_broadcast_setup_oneshot() static This function isn't used outside of tick-broadcast.c, so let's mark it static. Signed-off-by: Stephen Boyd Link: http://lkml.kernel.org/r/20170608063603.13276-1-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 +++- kernel/time/tick-internal.h | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 987e496bb51a..b398c2ea69b2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -37,9 +37,11 @@ static int tick_broadcast_forced; static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); #else +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } #endif @@ -867,7 +869,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, /** * tick_broadcast_setup_oneshot - setup the broadcast device */ -void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { int cpu = smp_processor_id(); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f738251000fe..be0ac01f2e12 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -126,7 +126,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) -extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); @@ -134,7 +133,6 @@ extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } -- cgit v1.2.3 From fa07ab72cbb0d843429e61bf179308aed6cbe0dd Mon Sep 17 00:00:00 2001 From: Heiner 
Kallweit Date: Sun, 11 Jun 2017 00:38:36 +0200 Subject: genirq: Release resources in __setup_irq() error path In case __irq_set_trigger() fails, the resources requested via irq_request_resources() are not released. Add the missing release call to the error handling path. Fixes: c1bacbae8192 ("genirq: Provide irq_request/release_resources chip callbacks") Signed-off-by: Heiner Kallweit Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/655538f5-cb20-a892-ff15-fbd2dd1fa4ec@gmail.com --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 070be980c37a..425170d4439b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1312,8 +1312,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) { + if (ret) { + irq_release_resources(desc); goto out_mask; + } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ -- cgit v1.2.3
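
The error-path discipline restored by the genirq patch above is a common kernel pattern: whatever a setup routine has already acquired must be released on every later failure path before unwinding. The following is a minimal, self-contained C sketch of that pattern as plain userspace code; the stub functions request_resources(), release_resources() and set_trigger() are hypothetical stand-ins, not the actual genirq API.

#include <stdio.h>

/*
 * Hypothetical stand-ins for irq_request_resources(),
 * irq_release_resources() and __irq_set_trigger(); they only
 * model success and failure.
 */
static int request_resources(void)
{
	puts("resources requested");
	return 0;
}

static void release_resources(void)
{
	puts("resources released");
}

static int set_trigger(int should_fail)
{
	return should_fail ? -1 : 0;
}

/*
 * Mirrors the shape of __setup_irq(): acquire first, then make sure
 * every later failure point releases what was acquired before unwinding.
 */
static int setup_example(int trigger_fails)
{
	int ret;

	ret = request_resources();
	if (ret)
		return ret;

	ret = set_trigger(trigger_fails);
	if (ret) {
		/* The fix above: undo the earlier request before bailing out. */
		release_resources();
		goto out;
	}

	puts("setup complete");
	return 0;

out:
	/* Shared unwinding (masking, unlocking, ...) would continue here. */
	return ret;
}

int main(void)
{
	setup_example(0);	/* success path */
	setup_example(1);	/* failure path exercises the cleanup */
	return 0;
}

Built as an ordinary program, the second call prints "resources requested" followed by "resources released", which is exactly the pairing the patch restores in __setup_irq().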