summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/capability.c20
-rw-r--r--kernel/cpuset.c32
-rw-r--r--kernel/debug/debug_core.c4
-rw-r--r--kernel/events/core.c139
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/jump_label.c7
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/ptrace.c44
-rw-r--r--kernel/sched/core.c22
-rw-r--r--kernel/sysctl.c10
-rw-r--r--kernel/time/timekeeping.c32
-rw-r--r--kernel/trace/trace.c29
-rw-r--r--kernel/trace/trace_functions_graph.c17
13 files changed, 264 insertions, 96 deletions
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..022df097a6bc 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -447,3 +447,23 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
kgid_has_mapping(ns, inode->i_gid);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
+ * in the specified user namespace.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+ int ret = 0; /* An absent tracer adds no restrictions */
+ const struct cred *cred;
+ rcu_read_lock();
+ cred = rcu_dereference(tsk->ptracer_cred);
+ if (cred)
+ ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ rcu_read_unlock();
+ return (ret == 0);
+}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f0acff0f66c9..71403502411b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -323,8 +323,7 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
@@ -333,8 +332,20 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
cs = parent_cs(cs);
+ if (unlikely(!cs)) {
+ /*
+ * The top cpuset doesn't have any online cpu as a
+ * consequence of a race between cpuset_hotplug_work
+ * and cpu hotplug notifier. But we know the top
+ * cpuset's effective_cpus is on its way to to be
+ * identical to cpu_online_mask.
+ */
+ cpumask_copy(pmask, cpu_online_mask);
+ return;
+ }
+ }
cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
@@ -2042,6 +2053,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&cpuset_mutex);
}
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task)
+{
+ if (task_css_is_root(task, cpuset_cgrp_id))
+ return;
+
+ set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ task->mems_allowed = current->mems_allowed;
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
@@ -2051,6 +2076,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.bind = cpuset_bind,
+ .fork = cpuset_fork,
.legacy_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..79517e5549f1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -598,11 +598,11 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- time_left = loops_per_jiffy * HZ;
+ time_left = MSEC_PER_SEC;
while (kgdb_do_roundup && --time_left &&
(atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
online_cpus)
- cpu_relax();
+ udelay(1000);
if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n");
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6da64f0d0630..34d4db5cd984 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -198,8 +198,11 @@ void update_perf_cpu_limits(void)
u64 tmp = perf_sample_period_ns;
tmp *= sysctl_perf_cpu_time_max_percent;
- do_div(tmp, 100);
- ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+ tmp = div_u64(tmp, 100);
+ if (!tmp)
+ tmp = 1;
+
+ WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -213,6 +216,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
+ /*
+ * If throttling is disabled don't allow the write:
+ */
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0)
+ return -EINVAL;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
@@ -226,12 +236,19 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
- update_perf_cpu_limits();
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0) {
+ printk(KERN_WARNING
+ "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
+ WRITE_ONCE(perf_sample_allowed_ns, 0);
+ } else {
+ update_perf_cpu_limits();
+ }
return 0;
}
@@ -245,62 +262,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
+static u64 __report_avg;
+static u64 __report_allowed;
+
static void perf_duration_warn(struct irq_work *w)
{
- u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- u64 avg_local_sample_len;
- u64 local_samples_len;
-
- local_samples_len = __this_cpu_read(running_sample_length);
- avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
printk_ratelimited(KERN_WARNING
- "perf interrupt took too long (%lld > %lld), lowering "
- "kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns >> 1,
- sysctl_perf_event_sample_rate);
+ "perf: interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ __report_avg, __report_allowed,
+ sysctl_perf_event_sample_rate);
}
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
void perf_sample_event_took(u64 sample_len_ns)
{
- u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- u64 avg_local_sample_len;
- u64 local_samples_len;
+ u64 max_len = READ_ONCE(perf_sample_allowed_ns);
+ u64 running_len;
+ u64 avg_len;
+ u32 max;
- if (allowed_ns == 0)
+ if (max_len == 0)
return;
- /* decay the counter by 1 average sample */
- local_samples_len = __this_cpu_read(running_sample_length);
- local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
- local_samples_len += sample_len_ns;
- __this_cpu_write(running_sample_length, local_samples_len);
+ /* Decay the counter by 1 average sample. */
+ running_len = __this_cpu_read(running_sample_length);
+ running_len -= running_len/NR_ACCUMULATED_SAMPLES;
+ running_len += sample_len_ns;
+ __this_cpu_write(running_sample_length, running_len);
/*
- * note: this will be biased artifically low until we have
- * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * Note: this will be biased artifically low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
* from having to maintain a count.
*/
- avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
- if (avg_local_sample_len <= allowed_ns)
+ avg_len = running_len/NR_ACCUMULATED_SAMPLES;
+ if (avg_len <= max_len)
return;
- if (max_samples_per_tick <= 1)
- return;
+ __report_avg = avg_len;
+ __report_allowed = max_len;
- max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
- sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
- perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ /*
+ * Compute a throttle threshold 25% below the current duration.
+ */
+ avg_len += avg_len / 4;
+ max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
+ if (avg_len < max)
+ max /= (u32)avg_len;
+ else
+ max = 1;
- update_perf_cpu_limits();
+ WRITE_ONCE(perf_sample_allowed_ns, avg_len);
+ WRITE_ONCE(max_samples_per_tick, max);
+
+ sysctl_perf_event_sample_rate = max * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
if (!irq_work_queue(&perf_duration_work)) {
- early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ early_printk("perf: interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns >> 1,
+ __report_avg, __report_allowed,
sysctl_perf_event_sample_rate);
}
}
@@ -5833,6 +5856,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
char *buf = NULL;
char *name;
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
if (file) {
struct inode *inode;
dev_t dev;
@@ -5859,27 +5903,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
maj = MAJOR(dev);
min = MINOR(dev);
- if (vma->vm_flags & VM_READ)
- prot |= PROT_READ;
- if (vma->vm_flags & VM_WRITE)
- prot |= PROT_WRITE;
- if (vma->vm_flags & VM_EXEC)
- prot |= PROT_EXEC;
-
- if (vma->vm_flags & VM_MAYSHARE)
- flags = MAP_SHARED;
- else
- flags = MAP_PRIVATE;
-
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
- flags |= MAP_HUGETLB;
-
goto got_name;
} else {
if (vma->vm_ops && vma->vm_ops->name) {
diff --git a/kernel/futex.c b/kernel/futex.c
index 2214b70f1910..01ee2e8859c4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3074,4 +3074,4 @@ static int __init futex_init(void)
return 0;
}
-__initcall(futex_init);
+core_initcall(futex_init);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 9019f15deab2..7d4d0a917d13 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -116,6 +116,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+void static_key_deferred_flush(struct static_key_deferred *key)
+{
+ STATIC_KEY_CHECK_USE();
+ flush_delayed_work(&key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 3c1aca0c3543..59dc17ba820b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1394,7 +1394,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
{
struct console *con;
- trace_console(text, len);
+ trace_console_rcuidle(text, len);
if (level >= console_loglevel && !ignore_loglevel)
return;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 261ee21e62db..c67aab541ee2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -20,6 +20,7 @@
#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/regset.h>
@@ -39,6 +40,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
BUG_ON(!list_empty(&child->ptrace_entry));
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
+ rcu_read_lock();
+ child->ptracer_cred = get_cred(__task_cred(new_parent));
+ rcu_read_unlock();
}
/**
@@ -71,11 +75,15 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
*/
void __ptrace_unlink(struct task_struct *child)
{
+ const struct cred *old_cred;
BUG_ON(!child->ptrace);
child->ptrace = 0;
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
+ old_cred = child->ptracer_cred;
+ child->ptracer_cred = NULL;
+ put_cred(old_cred);
spin_lock(&child->sighand->siglock);
@@ -207,12 +215,34 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
-static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+static bool ptrace_has_cap(const struct cred *tcred, unsigned int mode)
{
+ struct user_namespace *tns = tcred->user_ns;
+
+ /* When a root-owned process enters a user namespace created by a
+ * malicious user, the user shouldn't be able to execute code under
+ * uid 0 by attaching to the root-owned process via ptrace.
+ * Therefore, similar to the capable_wrt_inode_uidgid() check,
+ * verify that all the uids and gids of the target process are
+ * mapped into a namespace below the current one in which the caller
+ * is capable.
+ * No fsuid/fsgid check because __ptrace_may_access doesn't do it
+ * either.
+ */
+ while (
+ !kuid_has_mapping(tns, tcred->euid) ||
+ !kuid_has_mapping(tns, tcred->suid) ||
+ !kuid_has_mapping(tns, tcred->uid) ||
+ !kgid_has_mapping(tns, tcred->egid) ||
+ !kgid_has_mapping(tns, tcred->sgid) ||
+ !kgid_has_mapping(tns, tcred->gid)) {
+ tns = tns->parent;
+ }
+
if (mode & PTRACE_MODE_NOAUDIT)
- return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ return has_ns_capability_noaudit(current, tns, CAP_SYS_PTRACE);
else
- return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+ return has_ns_capability(current, tns, CAP_SYS_PTRACE);
}
/* Returns 0 on success, -errno on denial. */
@@ -264,7 +294,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user_ns, mode))
+ if (ptrace_has_cap(tcred, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -275,7 +305,7 @@ ok:
dumpable = get_dumpable(task->mm);
rcu_read_lock();
if (dumpable != SUID_DUMP_USER &&
- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ !ptrace_has_cap(__task_cred(task), mode)) {
rcu_read_unlock();
return -EPERM;
}
@@ -343,10 +373,6 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize)
flags |= PT_SEIZED;
- rcu_read_lock();
- if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
- flags |= PT_PTRACE_CAP;
- rcu_read_unlock();
task->ptrace = flags;
__ptrace_link(task, current);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98e607121d09..6cb5f00696f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1672,6 +1672,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * [S] p->on_rq = 1; [L] P->state
+ * UNLOCK rq->lock -----.
+ * \
+ * +--- RMB
+ * schedule() /
+ * LOCK rq->lock -----'
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ *
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
+ * last wakeup of our task and the schedule that got our task
+ * current.
+ */
+ smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7d4900404c94..1431089b8a67 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
#include <linux/kexec.h>
+#include <linux/mount.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -1709,6 +1710,14 @@ static struct ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+ {
+ .procname = "mount-max",
+ .data = &sysctl_mount_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
{ }
};
@@ -2340,6 +2349,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
break;
if (neg)
continue;
+ val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max))
continue;
*i = val;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8fcc801fde15..d296b904685b 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -309,17 +309,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
+static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
+ cycle_t delta)
+{
+ u64 nsec;
+
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + arch_gettimeoffset();
+}
+
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
cycle_t delta;
- s64 nsec;
delta = timekeeping_get_delta(tkr);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
- nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
+static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
+ cycle_t cycles)
+{
+ cycle_t delta;
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+ return timekeeping_delta_to_ns(tkr, delta);
}
/**
@@ -421,8 +438,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
- now += clocksource_delta(tkr->read(tkr->clock),
- tkr->cycle_last, tkr->mask);
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tkr->read(tkr->clock),
+ tkr->cycle_last,
+ tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index de6ea94c41bb..61ea7e8cdde5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4694,19 +4694,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
struct trace_iterator *iter = filp->private_data;
ssize_t sret;
- /* return any leftover data */
- sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (sret != -EBUSY)
- return sret;
-
- trace_seq_init(&iter->seq);
-
/*
* Avoid more than one consumer on a single file descriptor
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
mutex_lock(&iter->mutex);
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ goto out;
+
+ trace_seq_init(&iter->seq);
+
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
@@ -5731,9 +5732,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
if (*ppos & (PAGE_SIZE - 1))
return -EINVAL;
@@ -5743,6 +5741,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
again:
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -5800,19 +5801,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
if (ret)
- return ret;
+ goto out;
+ ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
- return -EAGAIN;
+ goto out;
ret = wait_on_pipe(iter, true);
if (ret)
- return ret;
+ goto out;
goto again;
}
ret = splice_to_pipe(pipe, &spd);
+out:
splice_shrink_spd(&spd);
return ret;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a51e79688455..972ce5b596f4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -779,6 +779,10 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
@@ -787,7 +791,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data->depth = call->depth - 1;
/* No need to keep this function around for this depth */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = 0;
}
@@ -816,11 +821,16 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
cpu_data->depth = call->depth;
/* Save this function pointer to see if the exit matches */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = call->func;
}
@@ -1048,7 +1058,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
*/
cpu_data->depth = trace->depth - 1;
- if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+ if (trace->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(trace->depth < 0)) {
if (cpu_data->enter_funcs[trace->depth] != trace->func)
func_match = 0;
cpu_data->enter_funcs[trace->depth] = 0;