From fdb5c4531c1e0e50e609df83f736b6f3a02896e2 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 19 Jun 2018 00:04:24 +0100
Subject: bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF

If the kernel is compiled with CONFIG_CGROUP_BPF not enabled, it is not
possible to attach, detach or query IR BPF programs to /dev/lircN devices,
making them impossible to use. For embedded devices, it should be possible
to use IR decoding without cgroups or CONFIG_CGROUP_BPF enabled.

This change requires some refactoring, since bpf_prog_{attach,detach,query}
functions are now always compiled, but their code paths for cgroups need
moving out. Rather than a #ifdef CONFIG_CGROUP_BPF in kernel/bpf/syscall.c,
moving them to kernel/bpf/cgroup.c and kernel/bpf/sockmap.c does not
require #ifdefs since that is already conditionally compiled.

Fixes: f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2")
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/cgroup.c  | 54 ++++++++++++++++++++++++++++
 kernel/bpf/sockmap.c | 18 ++++++++++
 kernel/bpf/syscall.c | 99 +++++++++++-----------------------------------------
 3 files changed, 93 insertions(+), 78 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f7c00bd6f8e4..3d83ee7df381 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	return ret;
 }
 
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+			   enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
+	cgroup_put(cgrp);
+	return ret;
+}
+
+int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+
+	cgroup_put(cgrp);
+	return ret;
+}
+
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+
+	cgroup_put(cgrp);
+	return ret;
+}
+
 /**
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 52a91d816c0e..81d0c55a77aa 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1915,6 +1915,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 	return 0;
 }
 
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+			struct bpf_prog *prog)
+{
+	int ufd = attr->target_fd;
+	struct bpf_map *map;
+	struct fd f;
+	int err;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = sock_map_prog(map, prog, attr->attach_type);
+	fdput(f);
+	return err;
+}
+
 static void *sock_map_lookup(struct bpf_map *map, void *key)
 {
 	return NULL;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35dc466641f2..d10ecd78105f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1483,8 +1483,6 @@ out_free_tp:
 	return err;
 }
 
-#ifdef CONFIG_CGROUP_BPF
-
 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 					     enum bpf_attach_type attach_type)
 {
@@ -1499,40 +1497,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 
 #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
 
-static int sockmap_get_from_fd(const union bpf_attr *attr,
-			       int type, bool attach)
-{
-	struct bpf_prog *prog = NULL;
-	int ufd = attr->target_fd;
-	struct bpf_map *map;
-	struct fd f;
-	int err;
-
-	f = fdget(ufd);
-	map = __bpf_map_get(f);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
-
-	if (attach) {
-		prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
-		if (IS_ERR(prog)) {
-			fdput(f);
-			return PTR_ERR(prog);
-		}
-	}
-
-	err = sock_map_prog(map, prog, attr->attach_type);
-	if (err) {
-		fdput(f);
-		if (prog)
-			bpf_prog_put(prog);
-		return err;
-	}
-
-	fdput(f);
-	return 0;
-}
-
 #define BPF_F_ATTACH_MASK \
 	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
 
@@ -1540,7 +1504,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
 	struct bpf_prog *prog;
-	struct cgroup *cgrp;
 	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -1577,12 +1540,15 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
+		ptype = BPF_PROG_TYPE_SK_MSG;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+		ptype = BPF_PROG_TYPE_SK_SKB;
+		break;
 	case BPF_LIRC_MODE2:
-		return lirc_prog_attach(attr);
+		ptype = BPF_PROG_TYPE_LIRC_MODE2;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1596,18 +1562,20 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return -EINVAL;
 	}
 
-	cgrp = cgroup_get_from_fd(attr->target_fd);
-	if (IS_ERR(cgrp)) {
-		bpf_prog_put(prog);
-		return PTR_ERR(cgrp);
+	switch (ptype) {
+	case BPF_PROG_TYPE_SK_SKB:
+	case BPF_PROG_TYPE_SK_MSG:
+		ret = sockmap_get_from_fd(attr, ptype, prog);
+		break;
+	case BPF_PROG_TYPE_LIRC_MODE2:
+		ret = lirc_prog_attach(attr, prog);
+		break;
+	default:
+		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 	}
 
-	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
-				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
-	cgroup_put(cgrp);
-
 	return ret;
 }
 
@@ -1616,9 +1584,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
-	struct bpf_prog *prog;
-	struct cgroup *cgrp;
-	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -1651,29 +1616,17 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
+		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
 
-	cgrp = cgroup_get_from_fd(attr->target_fd);
-	if (IS_ERR(cgrp))
-		return PTR_ERR(cgrp);
-
-	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
-	if (IS_ERR(prog))
-		prog = NULL;
-
-	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
-	if (prog)
-		bpf_prog_put(prog);
-	cgroup_put(cgrp);
-	return ret;
+	return cgroup_bpf_prog_detach(attr, ptype);
 }
 
 #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
@@ -1681,9 +1634,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
 {
-	struct cgroup *cgrp;
-	int ret;
-
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_QUERY))
@@ -1711,14 +1661,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	default:
 		return -EINVAL;
 	}
-	cgrp = cgroup_get_from_fd(attr->query.target_fd);
-	if (IS_ERR(cgrp))
-		return PTR_ERR(cgrp);
-	ret = cgroup_bpf_query(cgrp, attr, uattr);
-	cgroup_put(cgrp);
-	return ret;
+
+	return cgroup_bpf_prog_query(attr, uattr);
 }
-#endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
 
@@ -2365,7 +2310,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
-#ifdef CONFIG_CGROUP_BPF
 	case BPF_PROG_ATTACH:
 		err = bpf_prog_attach(&attr);
 		break;
@@ -2375,7 +2319,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_QUERY:
 		err = bpf_prog_query(&attr, uattr);
 		break;
-#endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
 		break;
-- 
cgit v1.2.3


From 9331510135640429711afbd0c810686100824a79 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Tue, 26 Jun 2018 22:23:00 +0200
Subject: perf/core: Move inline keyword at the beginning of declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix non-fatal warning triggered during compilation with W=1:

  kernel/events/core.c:6106:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration]
   static void __always_inline
   ^~~~~~

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180626202301.20270-1-malat@debian.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80cca2b30c4f..8f0434a9951a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6482,7 +6482,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 		data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
-static void __always_inline
+static __always_inline void
 __perf_event_output(struct perf_event *event,
 		    struct perf_sample_data *data,
 		    struct pt_regs *regs,
-- 
cgit v1.2.3


From 210d0797c97d0e8f3b1a932a0dc143f4c57008a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Jun 2018 13:59:25 +0200
Subject: swiotlb: export swiotlb_dma_ops

For architectures that do not use per-device dma ops we need to export
the dma_map_ops structure returned from get_arch_dma_ops().

Fixes: 10314e09 ("riscv: add swiotlb support")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Andreas Schwab <schwab@suse.de>
---
 kernel/dma/swiotlb.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 04b68d9dffac..904541055792 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1085,3 +1085,4 @@ const struct dma_map_ops swiotlb_dma_ops = {
 	.unmap_page		= swiotlb_unmap_page,
 	.dma_supported		= dma_direct_supported,
 };
+EXPORT_SYMBOL(swiotlb_dma_ops);
-- 
cgit v1.2.3


From 85782e037f8aba8922dadb24a1523ca0b82ab8bc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 28 Jun 2018 23:34:59 +0200
Subject: bpf: undo prog rejection on read-only lock failure

Partially undo commit 9facc336876f ("bpf: reject any prog that failed
read-only lock") since it caused a regression, that is, syzkaller was
able to manage to cause a panic via fault injection deep in set_memory_ro()
path by letting an allocation fail: In x86's __change_page_attr_set_clr()
it was able to change the attributes of the primary mapping but not in
the alias mapping via cpa_process_alias(), so the second, inner call
to the __change_page_attr() via __change_page_attr_set_clr() had to split
a larger page and failed in the alloc_pages() with the artifically triggered
allocation error which is then propagated down to the call site.

Thus, for set_memory_ro() this means that it returned with an error, but
from debugging a probe_kernel_write() revealed EFAULT on that memory since
the primary mapping succeeded to get changed. Therefore the subsequent
hdr->locked = 0 reset triggered the panic as it was performed on read-only
memory, so call-site assumptions were infact wrong to assume that it would
either succeed /or/ not succeed at all since there's no such rollback in
set_memory_*() calls from partial change of mappings, in other words, we're
left in a state that is "half done". A later undo via set_memory_rw() is
succeeding though due to matching permissions on that part (aka due to the
try_preserve_large_page() succeeding). While reproducing locally with
explicitly triggering this error, the initial splitting only happens on
rare occasions and in real world it would additionally need oom conditions,
but that said, it could partially fail. Therefore, it is definitely wrong
to bail out on set_memory_ro() error and reject the program with the
set_memory_*() semantics we have today. Shouldn't have gone the extra mile
since no other user in tree today infact checks for any set_memory_*()
errors, e.g. neither module_enable_ro() / module_disable_ro() for module
RO/NX handling which is mostly default these days nor kprobes core with
alloc_insn_page() / free_insn_page() as examples that could be invoked long
after bootup and original 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit
against spraying attacks") did neither when it got first introduced to BPF
so "improving" with bailing out was clearly not right when set_memory_*()
cannot handle it today.

Kees suggested that if set_memory_*() can fail, we should annotate it with
__must_check, and all callers need to deal with it gracefully given those
set_memory_*() markings aren't "advisory", but they're expected to actually
do what they say. This might be an option worth to move forward in future
but would at the same time require that set_memory_*() calls from supporting
archs are guaranteed to be "atomic" in that they provide rollback if part
of the range fails, once that happened, the transition from RW -> RO could
be made more robust that way, while subsequent RO -> RW transition /must/
continue guaranteeing to always succeed the undo part.

Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com
Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com
Fixes: 9facc336876f ("bpf: reject any prog that failed read-only lock")
Cc: Laura Abbott <labbott@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/core.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	bpf_fill_ill_insns(hdr, size);
 
 	hdr->pages = size / PAGE_SIZE;
-	hdr->locked = 0;
-
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 	return 0;
 }
 
-static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
-{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	int i, err;
-
-	for (i = 0; i < fp->aux->func_cnt; i++) {
-		err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
-		if (err)
-			return err;
-	}
-
-	return bpf_prog_check_pages_ro_single(fp);
-#endif
-	return 0;
-}
-
 static void bpf_prog_select_func(struct bpf_prog *fp)
 {
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
 	 * all eBPF JITs might immediately support all features.
 	 */
 	*err = bpf_check_tail_call(fp);
-	if (*err)
-		return fp;
-
-	/* Checkpoint: at this point onwards any cBPF -> eBPF or
-	 * native eBPF program is read-only. If we failed to change
-	 * the page attributes (e.g. allocation failure from
-	 * splitting large pages), then reject the whole program
-	 * in order to guarantee not ending up with any W+X pages
-	 * from BPF side in kernel.
-	 */
-	*err = bpf_prog_check_pages_ro_locked(fp);
+
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
-- 
cgit v1.2.3


From 9901c5d77e969d8215a8e8d087ef02e6feddc84c Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sat, 30 Jun 2018 06:17:36 -0700
Subject: bpf: sockmap, fix crash when ipv6 sock is added

This fixes a crash where we assign tcp_prot to IPv6 sockets instead
of tcpv6_prot.

Previously we overwrote the sk->prot field with tcp_prot even in the
AF_INET6 case. This patch ensures the correct tcp_prot and tcpv6_prot
are used.

Tested with 'netserver -6' and 'netperf -H [IPv6]' as well as
'netperf -H [IPv4]'. The ESTABLISHED check resolves the previously
crashing case here.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Reported-by: syzbot+5c063698bdbfac19f363@syzkaller.appspotmail.com
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 58 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 81d0c55a77aa..bfdfbd199c3b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -140,6 +140,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
 			    int offset, size_t size, int flags);
+static void bpf_tcp_close(struct sock *sk, long timeout);
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 {
@@ -161,7 +162,42 @@ out:
 	return !empty;
 }
 
-static struct proto tcp_bpf_proto;
+enum {
+	SOCKMAP_IPV4,
+	SOCKMAP_IPV6,
+	SOCKMAP_NUM_PROTS,
+};
+
+enum {
+	SOCKMAP_BASE,
+	SOCKMAP_TX,
+	SOCKMAP_NUM_CONFIGS,
+};
+
+static struct proto *saved_tcpv6_prot __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
+static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
+			 struct proto *base)
+{
+	prot[SOCKMAP_BASE]			= *base;
+	prot[SOCKMAP_BASE].close		= bpf_tcp_close;
+	prot[SOCKMAP_BASE].recvmsg		= bpf_tcp_recvmsg;
+	prot[SOCKMAP_BASE].stream_memory_read	= bpf_tcp_stream_read;
+
+	prot[SOCKMAP_TX]			= prot[SOCKMAP_BASE];
+	prot[SOCKMAP_TX].sendmsg		= bpf_tcp_sendmsg;
+	prot[SOCKMAP_TX].sendpage		= bpf_tcp_sendpage;
+}
+
+static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
+{
+	int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
+	int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
+
+	sk->sk_prot = &bpf_tcp_prots[family][conf];
+}
+
 static int bpf_tcp_init(struct sock *sk)
 {
 	struct smap_psock *psock;
@@ -181,14 +217,17 @@ static int bpf_tcp_init(struct sock *sk)
 	psock->save_close = sk->sk_prot->close;
 	psock->sk_proto = sk->sk_prot;
 
-	if (psock->bpf_tx_msg) {
-		tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
-		tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
-		tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
-		tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
+	/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
+	if (sk->sk_family == AF_INET6 &&
+	    unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
+		spin_lock_bh(&tcpv6_prot_lock);
+		if (likely(sk->sk_prot != saved_tcpv6_prot)) {
+			build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
+			smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
+		}
+		spin_unlock_bh(&tcpv6_prot_lock);
 	}
-
-	sk->sk_prot = &tcp_bpf_proto;
+	update_sk_prot(sk, psock);
 	rcu_read_unlock();
 	return 0;
 }
@@ -1111,8 +1150,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
 
 static int bpf_tcp_ulp_register(void)
 {
-	tcp_bpf_proto = tcp_prot;
-	tcp_bpf_proto.close = bpf_tcp_close;
+	build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
 	/* Once BPF TX ULP is registered it is never unregistered. It
 	 * will be in the ULP list for the lifetime of the system. Doing
 	 * duplicate registers is not a problem.
-- 
cgit v1.2.3


From 54fedb42c6537dcb0102e4a58a88456a6286999d Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sat, 30 Jun 2018 06:17:41 -0700
Subject: bpf: sockmap, fix smap_list_map_remove when psock is in many maps

If a hashmap is free'd with open socks it removes the reference to
the hash entry from the psock. If that is the last reference to the
psock then it will also be free'd by the reference counting logic.
However the current logic that removes the hash reference from the
list of references is broken. In smap_list_remove() we first check
if the sockmap entry matches and then check if the hashmap entry
matches. But, the sockmap entry sill always match because its NULL in
this case which causes the first entry to be removed from the list.
If this is always the "right" entry (because the user adds/removes
entries in order) then everything is OK but otherwise a subsequent
bpf_tcp_close() may reference a free'd object.

To fix this create two list handlers one for sockmap and one for
sockhash.

Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index bfdfbd199c3b..65a937ed5762 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1602,17 +1602,27 @@ free_stab:
 	return ERR_PTR(err);
 }
 
-static void smap_list_remove(struct smap_psock *psock,
-			     struct sock **entry,
-			     struct htab_elem *hash_link)
+static void smap_list_map_remove(struct smap_psock *psock,
+				 struct sock **entry)
 {
 	struct smap_psock_map_entry *e, *tmp;
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry || e->hash_link == hash_link) {
+		if (e->entry == entry)
+			list_del(&e->list);
+	}
+}
+
+static void smap_list_hash_remove(struct smap_psock *psock,
+				  struct htab_elem *hash_link)
+{
+	struct smap_psock_map_entry *e, *tmp;
+
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		struct htab_elem *c = e->hash_link;
+
+		if (c == hash_link)
 			list_del(&e->list);
-			break;
-		}
 	}
 }
 
@@ -1647,7 +1657,7 @@ static void sock_map_free(struct bpf_map *map)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, &stab->sock_map[i], NULL);
+			smap_list_map_remove(psock, &stab->sock_map[i]);
 			smap_release_sock(psock, sock);
 		}
 		write_unlock_bh(&sock->sk_callback_lock);
@@ -1706,7 +1716,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (psock->bpf_parse)
 		smap_stop_sock(psock, sock);
-	smap_list_remove(psock, &stab->sock_map[k], NULL);
+	smap_list_map_remove(psock, &stab->sock_map[k]);
 	smap_release_sock(psock, sock);
 out:
 	write_unlock_bh(&sock->sk_callback_lock);
@@ -1908,7 +1918,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		struct smap_psock *opsock = smap_psock_sk(osock);
 
 		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i], NULL);
+		smap_list_map_remove(opsock, &stab->sock_map[i]);
 		smap_release_sock(opsock, osock);
 		write_unlock_bh(&osock->sk_callback_lock);
 	}
@@ -2142,7 +2152,7 @@ static void sock_hash_free(struct bpf_map *map)
 			 * (psock) to be null and queued for garbage collection.
 			 */
 			if (likely(psock)) {
-				smap_list_remove(psock, NULL, l);
+				smap_list_hash_remove(psock, l);
 				smap_release_sock(psock, sock);
 			}
 			write_unlock_bh(&sock->sk_callback_lock);
@@ -2322,7 +2332,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		psock = smap_psock_sk(l_old->sk);
 
 		hlist_del_rcu(&l_old->hash_node);
-		smap_list_remove(psock, NULL, l_old);
+		smap_list_hash_remove(psock, l_old);
 		smap_release_sock(psock, l_old->sk);
 		free_htab_elem(htab, l_old);
 	}
@@ -2390,7 +2400,7 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, NULL, l);
+			smap_list_hash_remove(psock, l);
 			smap_release_sock(psock, sock);
 		}
 		write_unlock_bh(&sock->sk_callback_lock);
-- 
cgit v1.2.3


From e9db4ef6bf4ca9894bb324c76e01b8f1a16b2650 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sat, 30 Jun 2018 06:17:47 -0700
Subject: bpf: sockhash fix omitted bucket lock in sock_close

First the sk_callback_lock() was being used to protect both the
sock callback hooks and the psock->maps list. This got overly
convoluted after the addition of sockhash (in sockmap it made
some sense because masp and callbacks were tightly coupled) so
lets split out a specific lock for maps and only use the callback
lock for its intended purpose. This fixes a couple cases where
we missed using maps lock when it was in fact needed. Also this
makes it easier to follow the code because now we can put the
locking closer to the actual code its serializing.

Next, in sock_hash_delete_elem() the pattern was as follows,

  sock_hash_delete_elem()
     [...]
     spin_lock(bucket_lock)
     l = lookup_elem_raw()
     if (l)
        hlist_del_rcu()
        write_lock(sk_callback_lock)
         .... destroy psock ...
        write_unlock(sk_callback_lock)
     spin_unlock(bucket_lock)

The ordering is necessary because we only know the {p}sock after
dereferencing the hash table which we can't do unless we have the
bucket lock held. Once we have the bucket lock and the psock element
it is deleted from the hashmap to ensure any other path doing a lookup
will fail. Finally, the refcnt is decremented and if zero the psock
is destroyed.

In parallel with the above (or free'ing the map) a tcp close event
may trigger tcp_close(). Which at the moment omits the bucket lock
altogether (oops!) where the flow looks like this,

  bpf_tcp_close()
     [...]
     write_lock(sk_callback_lock)
     for each psock->maps // list of maps this sock is part of
         hlist_del_rcu(ref_hash_node);
         .... destroy psock ...
     write_unlock(sk_callback_lock)

Obviously, and demonstrated by syzbot, this is broken because
we can have multiple threads deleting entries via hlist_del_rcu().

To fix this we might be tempted to wrap the hlist operation in a
bucket lock but that would create a lock inversion problem. In
summary to follow locking rules the psocks maps list needs the
sk_callback_lock (after this patch maps_lock) but we need the bucket
lock to do the hlist_del_rcu.

To resolve the lock inversion problem pop the head of the maps list
repeatedly and remove the reference until no more are left. If a
delete happens in parallel from the BPF API that is OK as well because
it will do a similar action, lookup the lock in the map/hash, delete
it from the map/hash, and dec the refcnt. We check for this case
before doing a destroy on the psock to ensure we don't have two
threads tearing down a psock. The new logic is as follows,

  bpf_tcp_close()
  e = psock_map_pop(psock->maps) // done with map lock
  bucket_lock() // lock hash list bucket
  l = lookup_elem_raw(head, hash, key, key_size);
  if (l) {
     //only get here if elmnt was not already removed
     hlist_del_rcu()
     ... destroy psock...
  }
  bucket_unlock()

And finally for all the above to work add missing locking around  map
operations per above. Then add RCU annotations and use
rcu_dereference/rcu_assign_pointer to manage values relying on RCU so
that the object is not free'd from sock_hash_free() while it is being
referenced in bpf_tcp_close().

Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 145 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 96 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 65a937ed5762..ac09b35a9567 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -72,6 +72,7 @@ struct bpf_htab {
 	u32 n_buckets;
 	u32 elem_size;
 	struct bpf_sock_progs progs;
+	struct rcu_head rcu;
 };
 
 struct htab_elem {
@@ -89,8 +90,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
 	struct list_head list;
 	struct sock **entry;
-	struct htab_elem *hash_link;
-	struct bpf_htab *htab;
+	struct htab_elem __rcu *hash_link;
+	struct bpf_htab __rcu *htab;
 };
 
 struct smap_psock {
@@ -120,6 +121,7 @@ struct smap_psock {
 	struct bpf_prog *bpf_parse;
 	struct bpf_prog *bpf_verdict;
 	struct list_head maps;
+	spinlock_t maps_lock;
 
 	/* Back reference used when sock callback trigger sockmap operations */
 	struct sock *sock;
@@ -258,16 +260,54 @@ out:
 	rcu_read_unlock();
 }
 
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+					 u32 hash, void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node) {
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+	}
+
+	return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
 	atomic_dec(&htab->count);
 	kfree_rcu(l, rcu);
 }
 
+static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+						  struct smap_psock *psock)
+{
+	struct smap_psock_map_entry *e;
+
+	spin_lock_bh(&psock->maps_lock);
+	e = list_first_entry_or_null(&psock->maps,
+				     struct smap_psock_map_entry,
+				     list);
+	if (e)
+		list_del(&e->list);
+	spin_unlock_bh(&psock->maps_lock);
+	return e;
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
-	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock_map_entry *e;
 	struct sk_msg_buff *md, *mtmp;
 	struct smap_psock *psock;
 	struct sock *osk;
@@ -286,7 +326,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	 */
 	close_fun = psock->save_close;
 
-	write_lock_bh(&sk->sk_callback_lock);
 	if (psock->cork) {
 		free_start_sg(psock->sock, psock->cork);
 		kfree(psock->cork);
@@ -299,20 +338,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 		kfree(md);
 	}
 
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+	e = psock_map_pop(sk, psock);
+	while (e) {
 		if (e->entry) {
 			osk = cmpxchg(e->entry, sk, NULL);
 			if (osk == sk) {
-				list_del(&e->list);
 				smap_release_sock(psock, sk);
 			}
 		} else {
-			hlist_del_rcu(&e->hash_link->hash_node);
-			smap_release_sock(psock, e->hash_link->sk);
-			free_htab_elem(e->htab, e->hash_link);
+			struct htab_elem *link = rcu_dereference(e->hash_link);
+			struct bpf_htab *htab = rcu_dereference(e->htab);
+			struct hlist_head *head;
+			struct htab_elem *l;
+			struct bucket *b;
+
+			b = __select_bucket(htab, link->hash);
+			head = &b->head;
+			raw_spin_lock_bh(&b->lock);
+			l = lookup_elem_raw(head,
+					    link->hash, link->key,
+					    htab->map.key_size);
+			/* If another thread deleted this object skip deletion.
+			 * The refcnt on psock may or may not be zero.
+			 */
+			if (l) {
+				hlist_del_rcu(&link->hash_node);
+				smap_release_sock(psock, link->sk);
+				free_htab_elem(htab, link);
+			}
+			raw_spin_unlock_bh(&b->lock);
 		}
+		e = psock_map_pop(sk, psock);
 	}
-	write_unlock_bh(&sk->sk_callback_lock);
 	rcu_read_unlock();
 	close_fun(sk, timeout);
 }
@@ -1395,7 +1452,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 {
 	if (refcount_dec_and_test(&psock->refcnt)) {
 		tcp_cleanup_ulp(sock);
+		write_lock_bh(&sock->sk_callback_lock);
 		smap_stop_sock(psock, sock);
+		write_unlock_bh(&sock->sk_callback_lock);
 		clear_bit(SMAP_TX_RUNNING, &psock->state);
 		rcu_assign_sk_user_data(sock, NULL);
 		call_rcu_sched(&psock->rcu, smap_destroy_psock);
@@ -1546,6 +1605,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node)
 	INIT_LIST_HEAD(&psock->maps);
 	INIT_LIST_HEAD(&psock->ingress);
 	refcount_set(&psock->refcnt, 1);
+	spin_lock_init(&psock->maps_lock);
 
 	rcu_assign_sk_user_data(sock, psock);
 	sock_hold(sock);
@@ -1607,10 +1667,12 @@ static void smap_list_map_remove(struct smap_psock *psock,
 {
 	struct smap_psock_map_entry *e, *tmp;
 
+	spin_lock_bh(&psock->maps_lock);
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
 		if (e->entry == entry)
 			list_del(&e->list);
 	}
+	spin_unlock_bh(&psock->maps_lock);
 }
 
 static void smap_list_hash_remove(struct smap_psock *psock,
@@ -1618,12 +1680,14 @@ static void smap_list_hash_remove(struct smap_psock *psock,
 {
 	struct smap_psock_map_entry *e, *tmp;
 
+	spin_lock_bh(&psock->maps_lock);
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		struct htab_elem *c = e->hash_link;
+		struct htab_elem *c = rcu_dereference(e->hash_link);
 
 		if (c == hash_link)
 			list_del(&e->list);
 	}
+	spin_unlock_bh(&psock->maps_lock);
 }
 
 static void sock_map_free(struct bpf_map *map)
@@ -1649,7 +1713,6 @@ static void sock_map_free(struct bpf_map *map)
 		if (!sock)
 			continue;
 
-		write_lock_bh(&sock->sk_callback_lock);
 		psock = smap_psock_sk(sock);
 		/* This check handles a racing sock event that can get the
 		 * sk_callback_lock before this case but after xchg happens
@@ -1660,7 +1723,6 @@ static void sock_map_free(struct bpf_map *map)
 			smap_list_map_remove(psock, &stab->sock_map[i]);
 			smap_release_sock(psock, sock);
 		}
-		write_unlock_bh(&sock->sk_callback_lock);
 	}
 	rcu_read_unlock();
 
@@ -1709,7 +1771,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 	if (!sock)
 		return -EINVAL;
 
-	write_lock_bh(&sock->sk_callback_lock);
 	psock = smap_psock_sk(sock);
 	if (!psock)
 		goto out;
@@ -1719,7 +1780,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 	smap_list_map_remove(psock, &stab->sock_map[k]);
 	smap_release_sock(psock, sock);
 out:
-	write_unlock_bh(&sock->sk_callback_lock);
 	return 0;
 }
 
@@ -1800,7 +1860,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		}
 	}
 
-	write_lock_bh(&sock->sk_callback_lock);
 	psock = smap_psock_sk(sock);
 
 	/* 2. Do not allow inheriting programs if psock exists and has
@@ -1857,7 +1916,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		if (err)
 			goto out_free;
 		smap_init_progs(psock, verdict, parse);
+		write_lock_bh(&sock->sk_callback_lock);
 		smap_start_sock(psock, sock);
+		write_unlock_bh(&sock->sk_callback_lock);
 	}
 
 	/* 4. Place psock in sockmap for use and stop any programs on
@@ -1867,9 +1928,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	 */
 	if (map_link) {
 		e->entry = map_link;
+		spin_lock_bh(&psock->maps_lock);
 		list_add_tail(&e->list, &psock->maps);
+		spin_unlock_bh(&psock->maps_lock);
 	}
-	write_unlock_bh(&sock->sk_callback_lock);
 	return err;
 out_free:
 	smap_release_sock(psock, sock);
@@ -1880,7 +1942,6 @@ out_progs:
 	}
 	if (tx_msg)
 		bpf_prog_put(tx_msg);
-	write_unlock_bh(&sock->sk_callback_lock);
 	kfree(e);
 	return err;
 }
@@ -1917,10 +1978,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	if (osock) {
 		struct smap_psock *opsock = smap_psock_sk(osock);
 
-		write_lock_bh(&osock->sk_callback_lock);
 		smap_list_map_remove(opsock, &stab->sock_map[i]);
 		smap_release_sock(opsock, osock);
-		write_unlock_bh(&osock->sk_callback_lock);
 	}
 out:
 	return err;
@@ -2109,14 +2168,13 @@ free_htab:
 	return ERR_PTR(err);
 }
 
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+static void __bpf_htab_free(struct rcu_head *rcu)
 {
-	return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
+	struct bpf_htab *htab;
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &__select_bucket(htab, hash)->head;
+	htab = container_of(rcu, struct bpf_htab, rcu);
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
 }
 
 static void sock_hash_free(struct bpf_map *map)
@@ -2135,16 +2193,18 @@ static void sock_hash_free(struct bpf_map *map)
 	 */
 	rcu_read_lock();
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
+		struct bucket *b = __select_bucket(htab, i);
+		struct hlist_head *head;
 		struct hlist_node *n;
 		struct htab_elem *l;
 
+		raw_spin_lock_bh(&b->lock);
+		head = &b->head;
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			struct sock *sock = l->sk;
 			struct smap_psock *psock;
 
 			hlist_del_rcu(&l->hash_node);
-			write_lock_bh(&sock->sk_callback_lock);
 			psock = smap_psock_sk(sock);
 			/* This check handles a racing sock event that can get
 			 * the sk_callback_lock before this case but after xchg
@@ -2155,13 +2215,12 @@ static void sock_hash_free(struct bpf_map *map)
 				smap_list_hash_remove(psock, l);
 				smap_release_sock(psock, sock);
 			}
-			write_unlock_bh(&sock->sk_callback_lock);
-			kfree(l);
+			free_htab_elem(htab, l);
 		}
+		raw_spin_unlock_bh(&b->lock);
 	}
 	rcu_read_unlock();
-	bpf_map_area_free(htab->buckets);
-	kfree(htab);
+	call_rcu(&htab->rcu, __bpf_htab_free);
 }
 
 static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
@@ -2188,19 +2247,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
 	return l_new;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
-					 u32 hash, void *key, u32 key_size)
-{
-	struct htab_elem *l;
-
-	hlist_for_each_entry_rcu(l, head, hash_node) {
-		if (l->hash == hash && !memcmp(&l->key, key, key_size))
-			return l;
-	}
-
-	return NULL;
-}
-
 static inline u32 htab_map_hash(const void *key, u32 key_len)
 {
 	return jhash(key, key_len, 0);
@@ -2320,9 +2366,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		goto bucket_err;
 	}
 
-	e->hash_link = l_new;
-	e->htab = container_of(map, struct bpf_htab, map);
+	rcu_assign_pointer(e->hash_link, l_new);
+	rcu_assign_pointer(e->htab,
+			   container_of(map, struct bpf_htab, map));
+	spin_lock_bh(&psock->maps_lock);
 	list_add_tail(&e->list, &psock->maps);
+	spin_unlock_bh(&psock->maps_lock);
 
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
@@ -2392,7 +2441,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
 		struct smap_psock *psock;
 
 		hlist_del_rcu(&l->hash_node);
-		write_lock_bh(&sock->sk_callback_lock);
 		psock = smap_psock_sk(sock);
 		/* This check handles a racing sock event that can get the
 		 * sk_callback_lock before this case but after xchg happens
@@ -2403,7 +2451,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
 			smap_list_hash_remove(psock, l);
 			smap_release_sock(psock, sock);
 		}
-		write_unlock_bh(&sock->sk_callback_lock);
 		free_htab_elem(htab, l);
 		ret = 0;
 	}
-- 
cgit v1.2.3


From caac76a5170eb508529bbff9d9300e9c57126444 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sat, 30 Jun 2018 06:17:52 -0700
Subject: bpf: sockhash, add release routine

Add map_release_uref pointer to hashmap ops. This was dropped when
original sockhash code was ported into bpf-next before initial
commit.

Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index ac09b35a9567..cf7b6a6dbd1f 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2496,6 +2496,7 @@ const struct bpf_map_ops sock_hash_ops = {
 	.map_get_next_key = sock_hash_get_next_key,
 	.map_update_elem = sock_hash_update_elem,
 	.map_delete_elem = sock_hash_delete_elem,
+	.map_release_uref = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
-- 
cgit v1.2.3


From d9c0ffcabd6aae7ff1e34e8078354c13bb9f1183 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 28 Jun 2018 18:29:41 +0200
Subject: sched/nohz: Skip remote tick on idle task entirely

Some people have reported that the warning in sched_tick_remote()
occasionally triggers, especially in favour of some RCU-Torture
pressure:

	WARNING: CPU: 11 PID: 906 at kernel/sched/core.c:3138 sched_tick_remote+0xb6/0xc0
	Modules linked in:
	CPU: 11 PID: 906 Comm: kworker/u32:3 Not tainted 4.18.0-rc2+ #1
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
	Workqueue: events_unbound sched_tick_remote
	RIP: 0010:sched_tick_remote+0xb6/0xc0
	Code: e8 0f 06 b8 00 c6 03 00 fb eb 9d 8b 43 04 85 c0 75 8d 48 8b 83 e0 0a 00 00 48 85 c0 75 81 eb 88 48 89 df e8 bc fe ff ff eb aa <0f> 0b eb
	+c5 66 0f 1f 44 00 00 bf 17 00 00 00 e8 b6 2e fe ff 0f b6
	Call Trace:
	 process_one_work+0x1df/0x3b0
	 worker_thread+0x44/0x3d0
	 kthread+0xf3/0x130
	 ? set_worker_desc+0xb0/0xb0
	 ? kthread_create_worker_on_cpu+0x70/0x70
	 ret_from_fork+0x35/0x40

This happens when the remote tick applies on an idle task. Usually the
idle_cpu() check avoids that, but it is performed before we lock the
runqueue and it is therefore racy. It was intended to be that way in
order to prevent from useless runqueue locks since idle task tick
callback is a no-op.

Now if the racy check slips out of our hands and we end up remotely
ticking an idle task, the empty task_tick_idle() is harmless. Still
it won't pass the WARN_ON_ONCE() test that ensures rq_clock_task() is
not too far from curr->se.exec_start because update_curr_idle() doesn't
update the exec_start value like other scheduler policies. Hence the
reported false positive.

So let's have another check, while the rq is locked, to make sure we
don't remote tick on an idle task. The lockless idle_cpu() still applies
to avoid unecessary rq lock contention.

Reported-by: Jacek Tomaka <jacekt@dug.com>
Reported-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1530203381-31234-1-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d8facba456..22fce36426c0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3113,7 +3113,9 @@ static void sched_tick_remote(struct work_struct *work)
 	struct tick_work *twork = container_of(dwork, struct tick_work, work);
 	int cpu = twork->cpu;
 	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *curr;
 	struct rq_flags rf;
+	u64 delta;
 
 	/*
 	 * Handle the tick only if it appears the remote CPU is running in full
@@ -3122,24 +3124,28 @@ static void sched_tick_remote(struct work_struct *work)
 	 * statistics and checks timeslices in a time-independent way, regardless
 	 * of when exactly it is running.
 	 */
-	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
-		struct task_struct *curr;
-		u64 delta;
+	if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+		goto out_requeue;
 
-		rq_lock_irq(rq, &rf);
-		update_rq_clock(rq);
-		curr = rq->curr;
-		delta = rq_clock_task(rq) - curr->se.exec_start;
+	rq_lock_irq(rq, &rf);
+	curr = rq->curr;
+	if (is_idle_task(curr))
+		goto out_unlock;
 
-		/*
-		 * Make sure the next tick runs within a reasonable
-		 * amount of time.
-		 */
-		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
-		curr->sched_class->task_tick(rq, curr, 0);
-		rq_unlock_irq(rq, &rf);
-	}
+	update_rq_clock(rq);
+	delta = rq_clock_task(rq) - curr->se.exec_start;
+
+	/*
+	 * Make sure the next tick runs within a reasonable
+	 * amount of time.
+	 */
+	WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+	curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+	rq_unlock_irq(rq, &rf);
 
+out_requeue:
 	/*
 	 * Run the remote tick once per second (1Hz). This arbitrary
 	 * frequency is large enough to avoid overload but short enough
-- 
cgit v1.2.3


From 296b2ffe7fa9ed756c41415c6b1512bc4ad687b1 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 26 Jun 2018 15:53:22 +0200
Subject: sched/rt: Fix call to cpufreq_update_util()

With commit:

  8f111bc357aa ("cpufreq/schedutil: Rewrite CPUFREQ_RT support")

the schedutil governor uses rq->rt.rt_nr_running to detect whether an
RT task is currently running on the CPU and to set frequency to max
if necessary.

cpufreq_update_util() is called in enqueue/dequeue_top_rt_rq() but
rq->rt.rt_nr_running has not been updated yet when dequeue_top_rt_rq() is
called so schedutil still considers that an RT task is running when the
last task is dequeued. The update of rq->rt.rt_nr_running happens later
in dequeue_rt_stack().

In fact, we can take advantage of the sequence that the dequeue then
re-enqueue rt entities when a rt task is enqueued or dequeued;
As a result enqueue_top_rt_rq() is always called when a task is
enqueued or dequeued and also when groups are throttled or unthrottled.
The only place that not use enqueue_top_rt_rq() is when root rt_rq is
throttled.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: efault@gmx.de
Cc: juri.lelli@redhat.com
Cc: patrick.bellasi@arm.com
Cc: viresh.kumar@linaro.org
Fixes: 8f111bc357aa ('cpufreq/schedutil: Rewrite CPUFREQ_RT support')
Link: http://lkml.kernel.org/r/1530021202-21695-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpufreq_schedutil.c |  2 +-
 kernel/sched/rt.c                | 16 ++++++++++------
 kernel/sched/sched.h             |  5 +++++
 3 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3cde46483f0a..c907fde01eaa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-	if (rq->rt.rt_nr_running)
+	if (rt_rq_is_runnable(&rq->rt))
 		return sg_cpu->max;
 
 	/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 47556b0c9a95..572567078b60 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
 	rt_se = rt_rq->tg->rt_se[cpu];
 
-	if (!rt_se)
+	if (!rt_se) {
 		dequeue_top_rt_rq(rt_rq);
+		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+	}
 	else if (on_rt_rq(rt_se))
 		dequeue_rt_entity(rt_se, 0);
 }
@@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
 	sub_nr_running(rq, rt_rq->rt_nr_running);
 	rt_rq->rt_queued = 0;
 
-	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
-	cpufreq_update_util(rq, 0);
 }
 
 static void
@@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
 
 	if (rt_rq->rt_queued)
 		return;
-	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+	if (rt_rq_throttled(rt_rq))
 		return;
 
-	add_nr_running(rq, rt_rq->rt_nr_running);
-	rt_rq->rt_queued = 1;
+	if (rt_rq->rt_nr_running) {
+		add_nr_running(rq, rt_rq->rt_nr_running);
+		rt_rq->rt_queued = 1;
+	}
 
 	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 	cpufreq_update_util(rq, 0);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6601baf2361c..27ddec334601 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -609,6 +609,11 @@ struct rt_rq {
 #endif
 };
 
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
 	/* runqueue is an rbtree, ordered by deadline */
-- 
cgit v1.2.3


From 512ac999d2755d2b7109e996a76b6fb8b888631d Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@linux.alibaba.com>
Date: Wed, 20 Jun 2018 18:18:33 +0800
Subject: sched/fair: Fix bandwidth timer clock drift condition

I noticed that cgroup task groups constantly get throttled even
if they have low CPU usage, this causes some jitters on the response
time to some of our business containers when enabling CPU quotas.

It's very simple to reproduce:

  mkdir /sys/fs/cgroup/cpu/test
  cd /sys/fs/cgroup/cpu/test
  echo 100000 > cpu.cfs_quota_us
  echo $$ > tasks

then repeat:

  cat cpu.stat | grep nr_throttled  # nr_throttled will increase steadily

After some analysis, we found that cfs_rq::runtime_remaining will
be cleared by expire_cfs_rq_runtime() due to two equal but stale
"cfs_{b|q}->runtime_expires" after period timer is re-armed.

The current condition to judge clock drift in expire_cfs_rq_runtime()
is wrong, the two runtime_expires are actually the same when clock
drift happens, so this condtion can never hit. The orginal design was
correctly done by this commit:

  a9cf55b28610 ("sched: Expire invalid runtime")

... but was changed to be the current implementation due to its locking bug.

This patch introduces another way, it adds a new field in both structures
cfs_rq and cfs_bandwidth to record the expiration update sequence, and
uses them to figure out if clock drift happens (true if they are equal).

Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")
Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 14 ++++++++------
 kernel/sched/sched.h |  6 ++++--
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1866e64792a7..791707c56886 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4590,6 +4590,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 	now = sched_clock_cpu(smp_processor_id());
 	cfs_b->runtime = cfs_b->quota;
 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+	cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4613,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 	u64 amount = 0, min_amount, expires;
+	int expires_seq;
 
 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4628,6 +4630,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 			cfs_b->idle = 0;
 		}
 	}
+	expires_seq = cfs_b->expires_seq;
 	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -4637,8 +4640,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	 * spread between our sched_clock and the one on which runtime was
 	 * issued.
 	 */
-	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+	if (cfs_rq->expires_seq != expires_seq) {
+		cfs_rq->expires_seq = expires_seq;
 		cfs_rq->runtime_expires = expires;
+	}
 
 	return cfs_rq->runtime_remaining > 0;
 }
@@ -4664,12 +4669,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	 * has not truly expired.
 	 *
 	 * Fortunately we can check determine whether this the case by checking
-	 * whether the global deadline has advanced. It is valid to compare
-	 * cfs_b->runtime_expires without any locks since we only care about
-	 * exact equality, so a partial write will still work.
+	 * whether the global deadline(cfs_b->expires_seq) has advanced.
 	 */
-
-	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
 		/* extend local deadline, drift is bounded above by 2 ticks */
 		cfs_rq->runtime_expires += TICK_NSEC;
 	} else {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 27ddec334601..c7742dcc136c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
 	u64			runtime;
 	s64			hierarchical_quota;
 	u64			runtime_expires;
+	int			expires_seq;
 
-	int			idle;
-	int			period_active;
+	short			idle;
+	short			period_active;
 	struct hrtimer		period_timer;
 	struct hrtimer		slack_timer;
 	struct list_head	throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
 	int			runtime_enabled;
+	int			expires_seq;
 	u64			runtime_expires;
 	s64			runtime_remaining;
 
-- 
cgit v1.2.3


From f1d1be8aee6c461652aea8f58bedebaa73d7f4d3 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@linux.alibaba.com>
Date: Wed, 20 Jun 2018 18:18:34 +0800
Subject: sched/fair: Advance global expiration when period timer is restarted

When period gets restarted after some idle time, start_cfs_bandwidth()
doesn't update the expiration information, expire_cfs_rq_runtime() will
see cfs_rq->runtime_expires smaller than rq clock and go to the clock
drift logic, wasting needless CPU cycles on the scheduler hot path.

Update the global expiration in start_cfs_bandwidth() to avoid frequent
expire_cfs_rq_runtime() calls once a new period begins.

Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180620101834.24455-2-xlpang@linux.alibaba.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 791707c56886..840b92ee6f89 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5204,13 +5204,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+	u64 overrun;
+
 	lockdep_assert_held(&cfs_b->lock);
 
-	if (!cfs_b->period_active) {
-		cfs_b->period_active = 1;
-		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
-	}
+	if (cfs_b->period_active)
+		return;
+
+	cfs_b->period_active = 1;
+	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+	cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+	cfs_b->expires_seq++;
+	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-- 
cgit v1.2.3


From 3482d98bbc730758b63a5d1cf41d05ea17481412 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Thu, 14 Jun 2018 12:33:00 +0200
Subject: sched/util_est: Fix util_est_dequeue() for throttled cfs_rq

When a cfs_rq is throttled, parent cfs_rq->nr_running is decreased and
everything happens at cfs_rq level. Currently util_est stays unchanged
in such case and it keeps accounting the utilization of throttled tasks.
This can somewhat make sense as we don't dequeue tasks but only throttled
cfs_rq.

If a task of another group is enqueued/dequeued and root cfs_rq becomes
idle during the dequeue, util_est will be cleared whereas it was
accounting util_est of throttled tasks before. So the behavior of util_est
is not always the same regarding throttled tasks and depends of side
activity. Furthermore, util_est will not be updated when the cfs_rq is
unthrottled as everything happens at cfs_rq level. Main results is that
util_est will stay null whereas we now have running tasks. We have to wait
for the next dequeue/enqueue of the previously throttled tasks to get an
up to date util_est.

Remove the assumption that cfs_rq's estimated utilization of a CPU is 0
if there is no running task so the util_est of a task remains until the
latter is dequeued even if its cfs_rq has been throttled.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT")
Link: http://lkml.kernel.org/r/1528972380-16268-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 840b92ee6f89..2f0a0be4d344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 	if (!sched_feat(UTIL_EST))
 		return;
 
-	/*
-	 * Update root cfs_rq's estimated utilization
-	 *
-	 * If *p is the last task then the root cfs_rq's estimated utilization
-	 * of a CPU is 0 by definition.
-	 */
-	ue.enqueued = 0;
-	if (cfs_rq->nr_running) {
-		ue.enqueued  = cfs_rq->avg.util_est.enqueued;
-		ue.enqueued -= min_t(unsigned int, ue.enqueued,
-				     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
-	}
+	/* Update root cfs_rq's estimated utilization */
+	ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+	ue.enqueued -= min_t(unsigned int, ue.enqueued,
+			     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
 	/*
-- 
cgit v1.2.3


From 1cef1150ef40ec52f507436a14230cbc2623299c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 7 Jun 2018 11:45:49 +0200
Subject: kthread, sched/core: Fix kthread_parkme() (again...)

Gaurav reports that commit:

  85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")

isn't working for him. Because of the following race:

> controller Thread                               CPUHP Thread
> takedown_cpu
> kthread_park
> kthread_parkme
> Set KTHREAD_SHOULD_PARK
>                                                 smpboot_thread_fn
>                                                 set Task interruptible
>
>
> wake_up_process
>  if (!(p->state & state))
>                 goto out;
>
>                                                 Kthread_parkme
>                                                 SET TASK_PARKED
>                                                 schedule
>                                                 raw_spin_lock(&rq->lock)
> ttwu_remote
> waiting for __task_rq_lock
>                                                 context_switch
>
>                                                 finish_lock_switch
>
>
>
>                                                 Case TASK_PARKED
>                                                 kthread_park_complete
>
>
> SET Running

Furthermore, Oleg noticed that the whole scheduler TASK_PARKED
handling is buggered because the TASK_DEAD thing is done with
preemption disabled, the current code can still complete early on
preemption :/

So basically revert that earlier fix and go with a variant of the
alternative mentioned in the commit. Promote TASK_PARKED to special
state to avoid the store-store issue on task->state leading to the
WARN in kthread_unpark() -> __kthread_bind().

But in addition, add wait_task_inactive() to kthread_park() to ensure
the task really is PARKED when we return from kthread_park(). This
avoids the whole kthread still gets migrated nonsense -- although it
would be really good to get this done differently.

Reported-by: Gaurav Kohli <gkohli@codeaurora.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kthread.c    | 30 ++++++++++++++++++++++++------
 kernel/sched/core.c | 31 +++++++++++--------------------
 2 files changed, 35 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 481951bf091d..750cb8082694 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
 static void __kthread_parkme(struct kthread *self)
 {
 	for (;;) {
-		set_current_state(TASK_PARKED);
+		/*
+		 * TASK_PARKED is a special state; we must serialize against
+		 * possible pending wakeups to avoid store-store collisions on
+		 * task->state.
+		 *
+		 * Such a collision might possibly result in the task state
+		 * changin from TASK_PARKED and us failing the
+		 * wait_task_inactive() in kthread_park().
+		 */
+		set_special_state(TASK_PARKED);
 		if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
 			break;
+
+		complete_all(&self->parked);
 		schedule();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -191,11 +202,6 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-void kthread_park_complete(struct task_struct *k)
-{
-	complete_all(&to_kthread(k)->parked);
-}
-
 static int kthread(void *_create)
 {
 	/* Copy data: it's on kthread's stack */
@@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k)
 
 	reinit_completion(&kthread->parked);
 	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+	/*
+	 * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+	 */
 	wake_up_state(k, TASK_PARKED);
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
 	set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
 	if (k != current) {
 		wake_up_process(k);
+		/*
+		 * Wait for __kthread_parkme() to complete(), this means we
+		 * _will_ have TASK_PARKED and are about to call schedule().
+		 */
 		wait_for_completion(&kthread->parked);
+		/*
+		 * Now wait for that schedule() to complete and the task to
+		 * get scheduled out.
+		 */
+		WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
 	}
 
 	return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 22fce36426c0..fe365c9a08e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,7 +7,6 @@
  */
 #include "sched.h"
 
-#include <linux/kthread.h>
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
@@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		membarrier_mm_sync_core_before_usermode(mm);
 		mmdrop(mm);
 	}
-	if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
-		switch (prev_state) {
-		case TASK_DEAD:
-			if (prev->sched_class->task_dead)
-				prev->sched_class->task_dead(prev);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		if (prev->sched_class->task_dead)
+			prev->sched_class->task_dead(prev);
 
-			/*
-			 * Remove function-return probe instances associated with this
-			 * task and put them back on the free list.
-			 */
-			kprobe_flush_task(prev);
-
-			/* Task is done with its stack. */
-			put_task_stack(prev);
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
 
-			put_task_struct(prev);
-			break;
+		/* Task is done with its stack. */
+		put_task_stack(prev);
 
-		case TASK_PARKED:
-			kthread_park_complete(prev);
-			break;
-		}
+		put_task_struct(prev);
 	}
 
 	tick_nohz_task_switch();
-- 
cgit v1.2.3


From cf4d418e653afc84c9c873236033e06be5d58f1c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 28 Mar 2018 16:09:10 +0200
Subject: tracing: Avoid string overflow

'err' is used as a NUL-terminated string, but using strncpy() with the length
equal to the buffer size may result in lack of the termination:

kernel/trace/trace_events_hist.c: In function 'hist_err_event':
kernel/trace/trace_events_hist.c:396:3: error: 'strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation]
   strncpy(err, var, MAX_FILTER_STR_VAL);

This changes it to use the safer strscpy() instead.

Link: http://lkml.kernel.org/r/20180328140920.2842153-1-arnd@arndb.de

Cc: stable@vger.kernel.org
Fixes: f404da6e1d46 ("tracing: Add 'last error' error facility for hist triggers")
Acked-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 046c716a6536..aae18af94c94 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -393,7 +393,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var)
 	else if (system)
 		snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
 	else
-		strncpy(err, var, MAX_FILTER_STR_VAL);
+		strscpy(err, var, MAX_FILTER_STR_VAL);
 
 	hist_err(str, err);
 }
-- 
cgit v1.2.3


From f90658725ba7ebb031054866aff4cda0d099a3b1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 2 Jul 2018 11:41:38 -0400
Subject: tracing: Make create_filter() code match the comments

The comment in create_filter() states that the passed in filter pointer
(filterp) will either be NULL or contain an error message stating why the
filter failed. But it also expects the filter pointer to point to NULL when
passed in. If it is not, the function create_filter_start() will warn and
return an error message without updating the filter pointer. This is not
what the comment states.

As we always expect the pointer to point to NULL, if it is not, trigger a
WARN_ON(), set it to NULL, and then continue the path as the rest will work
as the comment states. Also update the comment to state it must point to
NULL.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0dceb77d1d42..893a206bcba4 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1701,6 +1701,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
  * @filter_str: filter string
  * @set_str: remember @filter_str and enable detailed error in filter
  * @filterp: out param for created filter (always updated on return)
+ *           Must be a pointer that references a NULL pointer.
  *
  * Creates a filter for @call with @filter_str.  If @set_str is %true,
  * @filter_str is copied and recorded in the new filter.
@@ -1718,6 +1719,10 @@ static int create_filter(struct trace_event_call *call,
 	struct filter_parse_error *pe = NULL;
 	int err;
 
+	/* filterp must point to NULL */
+	if (WARN_ON(*filterp))
+		*filterp = NULL;
+
 	err = create_filter_start(filter_string, set_str, &pe, filterp);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From f26808ba7227a921e0e8549c7d3c52332b920085 Mon Sep 17 00:00:00 2001
From: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Date: Sun, 8 Apr 2018 19:36:31 +0800
Subject: tracing: Optimize trace_buffer_iter() logic

Simplify and optimize the logic in trace_buffer_iter() to use a conditional
operation instead of an if conditional.

Link: http://lkml.kernel.org/r/20180408113631.3947-1-cugyly@163.com

Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 630c5a24b2b2..f8f86231ad90 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit)
 static inline struct ring_buffer_iter *
 trace_buffer_iter(struct trace_iterator *iter, int cpu)
 {
-	if (iter->buffer_iter && iter->buffer_iter[cpu])
-		return iter->buffer_iter[cpu];
-	return NULL;
+	return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL;
 }
 
 int tracer_init(struct tracer *t, struct trace_array *tr);
-- 
cgit v1.2.3


From 26b68dd2f48fe7699a89f0cfbb9f4a650dc1c837 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Thu, 8 Mar 2018 21:58:43 +0100
Subject: tracing: Use __printf markup to silence compiler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Silence warnings (triggered at W=1) by adding relevant __printf attributes.

  CC      kernel/trace/trace.o
kernel/trace/trace.c: In function ‘__trace_array_vprintk’:
kernel/trace/trace.c:2979:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format]
  len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
  ^~~
  AR      kernel/trace/built-in.o

Link: http://lkml.kernel.org/r/20180308205843.27447-1-malat@debian.org

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0079b4c7a49..f054bd6a1c66 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2953,6 +2953,7 @@ out_nobuffer:
 }
 EXPORT_SYMBOL_GPL(trace_vbprintk);
 
+__printf(3, 0)
 static int
 __trace_array_vprintk(struct ring_buffer *buffer,
 		      unsigned long ip, const char *fmt, va_list args)
@@ -3007,12 +3008,14 @@ out_nobuffer:
 	return len;
 }
 
+__printf(3, 0)
 int trace_array_vprintk(struct trace_array *tr,
 			unsigned long ip, const char *fmt, va_list args)
 {
 	return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
 }
 
+__printf(3, 0)
 int trace_array_printk(struct trace_array *tr,
 		       unsigned long ip, const char *fmt, ...)
 {
@@ -3028,6 +3031,7 @@ int trace_array_printk(struct trace_array *tr,
 	return ret;
 }
 
+__printf(3, 4)
 int trace_array_printk_buf(struct ring_buffer *buffer,
 			   unsigned long ip, const char *fmt, ...)
 {
@@ -3043,6 +3047,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer,
 	return ret;
 }
 
+__printf(2, 0)
 int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 {
 	return trace_array_vprintk(&global_trace, ip, fmt, args);
-- 
cgit v1.2.3


From 5ccba64a560fa6ca06008d4001f5d46ebeb34b41 Mon Sep 17 00:00:00 2001
From: Yisheng Xie <xieyisheng1@huawei.com>
Date: Fri, 2 Feb 2018 10:14:49 +0800
Subject: ftrace: Nuke clear_ftrace_function

clear_ftrace_function is not used outside of ftrace.c and is not help to
use a function, so nuke it per Steve's suggestion.

Link: http://lkml.kernel.org/r/1517537689-34947-1-git-send-email-xieyisheng1@huawei.com

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index efed9c1cfb7e..caf9cbf35816 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 	op->saved_func(ip, parent_ip, op, regs);
 }
 
-/**
- * clear_ftrace_function - reset the ftrace function
- *
- * This NULLs the ftrace function and in essence stops
- * tracing.  There may be lag
- */
-void clear_ftrace_function(void)
-{
-	ftrace_trace_function = ftrace_stub;
-}
-
 static void ftrace_sync(struct work_struct *work)
 {
 	/*
@@ -6689,7 +6678,7 @@ void ftrace_kill(void)
 {
 	ftrace_disabled = 1;
 	ftrace_enabled = 0;
-	clear_ftrace_function();
+	ftrace_trace_function = ftrace_stub;
 }
 
 /**
-- 
cgit v1.2.3


From 1fe4293f4b8de75824935f8d8e9a99c7fc6873da Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@intel.com>
Date: Wed, 31 Jan 2018 23:48:49 +0800
Subject: tracing: Fix missing return symbol in function_graph output

The function_graph tracer does not show the interrupt return marker for the
leaf entry. On leaf entries, we see an unbalanced interrupt marker (the
interrupt was entered, but nevern left).

Before:
 1)               |  SyS_write() {
 1)               |    __fdget_pos() {
 1)   0.061 us    |      __fget_light();
 1)   0.289 us    |    }
 1)               |    vfs_write() {
 1)   0.049 us    |      rw_verify_area();
 1) + 15.424 us   |      __vfs_write();
 1)   ==========> |
 1)   6.003 us    |      smp_apic_timer_interrupt();
 1)   0.055 us    |      __fsnotify_parent();
 1)   0.073 us    |      fsnotify();
 1) + 23.665 us   |    }
 1) + 24.501 us   |  }

After:
 0)               |  SyS_write() {
 0)               |    __fdget_pos() {
 0)   0.052 us    |      __fget_light();
 0)   0.328 us    |    }
 0)               |    vfs_write() {
 0)   0.057 us    |      rw_verify_area();
 0)               |      __vfs_write() {
 0)   ==========> |
 0)   8.548 us    |      smp_apic_timer_interrupt();
 0)   <========== |
 0) + 36.507 us   |      } /* __vfs_write */
 0)   0.049 us    |      __fsnotify_parent();
 0)   0.066 us    |      fsnotify();
 0) + 50.064 us   |    }
 0) + 50.952 us   |  }

Link: http://lkml.kernel.org/r/1517413729-20411-1-git-send-email-changbin.du@intel.com

Cc: stable@vger.kernel.org
Fixes: f8b755ac8e0cc ("tracing/function-graph-tracer: Output arrows signal on hardirq call/return")
Signed-off-by: Changbin Du <changbin.du@intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 23c0b0cb5fb9..169b3c44ee97 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	struct ftrace_graph_ret *graph_ret;
 	struct ftrace_graph_ent *call;
 	unsigned long long duration;
+	int cpu = iter->cpu;
 	int i;
 
 	graph_ret = &ret_entry->ret;
@@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 
 	if (data) {
 		struct fgraph_cpu_data *cpu_data;
-		int cpu = iter->cpu;
 
 		cpu_data = per_cpu_ptr(data->cpu_data, cpu);
 
@@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 
 	trace_seq_printf(s, "%ps();\n", (void *)call->func);
 
+	print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
+			cpu, iter->ent->pid, flags);
+
 	return trace_handle_return(s);
 }
 
-- 
cgit v1.2.3


From 5b5ccbc2b041f98f26b984e013d303b7f9e6fb8e Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 9 Jul 2018 16:45:35 +0100
Subject: Revert "tick: Prefer a lower rating device only if it's CPU local
 device"

This reverts commit 1332a90558013ae4242e3dd7934bdcdeafb06c0d.

The original issue was not because of incorrect checking of cpumask for
both new and old tick device. It was incorrectly analysed was due to the
misunderstanding of the comment and misinterpretation of the return value
from tick_check_preferred. The main issue is with the clockevent driver
that sets the cpumask to cpu_all_mask instead of cpu_possible_mask.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Kevin Hilman <khilman@baylibre.com>
Tested-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: https://lkml.kernel.org/r/1531151136-18297-1-git-send-email-sudeep.holla@arm.com
---
 kernel/time/tick-common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b7005dd21ec1..14de3727b18e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
 	 */
 	return !curdev ||
 		newdev->rating > curdev->rating ||
-	       (!cpumask_equal(curdev->cpumask, newdev->cpumask) &&
-	        !tick_check_percpu(curdev, newdev, smp_processor_id()));
+	       !cpumask_equal(curdev->cpumask, newdev->cpumask);
 }
 
 /*
-- 
cgit v1.2.3


From e96d71359e9bbea846a2111e4469a03a055dfa6f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:50 -0400
Subject: rseq: Use __u64 for rseq_cs fields, validate user inputs

Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip
fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather
that ignoring the 32 upper bits on 32-bit kernels. This ensures we have a
consistent behavior for a 32-bit binary executed on 32-bit kernels and in
compat mode on 64-bit kernels.

Validating the value of abort_ip field to be below TASK_SIZE ensures the
kernel don't return to an invalid address when returning to userspace
after an abort. I don't fully trust each architecture code to consistently
deal with invalid return addresses.

Validating the value of the start_ip and post_commit_offset fields
prevents overflow on arithmetic performed on those values, used to
check whether abort_ip is within the rseq critical section.

If validation fails, the process is killed with a segmentation fault.

When the signature encountered before abort_ip does not match the expected
signature, return -EINVAL rather than -EPERM to be consistent with other
input validation return codes from rseq_get_rseq_cs().

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/20180709195155.7654-2-mathieu.desnoyers@efficios.com
---
 kernel/rseq.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rseq.c b/kernel/rseq.c
index 22b6acf1ad63..16b38c5342f9 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -130,14 +130,20 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 	urseq_cs = (struct rseq_cs __user *)ptr;
 	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
 		return -EFAULT;
-	if (rseq_cs->version > 0)
-		return -EINVAL;
 
+	if (rseq_cs->start_ip >= TASK_SIZE ||
+	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
+	    rseq_cs->abort_ip >= TASK_SIZE ||
+	    rseq_cs->version > 0)
+		return -EINVAL;
+	/* Check for overflow. */
+	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
+		return -EINVAL;
 	/* Ensure that abort_ip is not in the critical section. */
 	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
 		return -EINVAL;
 
-	usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32));
+	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
 	ret = get_user(sig, usig);
 	if (ret)
 		return ret;
@@ -146,7 +152,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 		printk_ratelimited(KERN_WARNING
 			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
 			sig, current->rseq_sig, current->pid, usig);
-		return -EPERM;
+		return -EINVAL;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 8f28177014925f968baf45fc833c25848faf8c1c Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:51 -0400
Subject: rseq: Use get_user/put_user rather than __get_user/__put_user

__get_user()/__put_user() is used to read values for address ranges that
were already checked with access_ok() on rseq registration.

It has been recognized that __get_user/__put_user are optimizing the
wrong thing. Replace them by get_user/put_user across rseq instead.

If those end up showing up in benchmarks, the proper approach would be to
use user_access_begin() / unsafe_{get,put}_user() / user_access_end()
anyway.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lkml.kernel.org/r/20180709195155.7654-3-mathieu.desnoyers@efficios.com
---
 kernel/rseq.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rseq.c b/kernel/rseq.c
index 16b38c5342f9..2c8463acb50d 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t)
 {
 	u32 cpu_id = raw_smp_processor_id();
 
-	if (__put_user(cpu_id, &t->rseq->cpu_id_start))
+	if (put_user(cpu_id, &t->rseq->cpu_id_start))
 		return -EFAULT;
-	if (__put_user(cpu_id, &t->rseq->cpu_id))
+	if (put_user(cpu_id, &t->rseq->cpu_id))
 		return -EFAULT;
 	trace_rseq_update(t);
 	return 0;
@@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
 	/*
 	 * Reset cpu_id_start to its initial state (0).
 	 */
-	if (__put_user(cpu_id_start, &t->rseq->cpu_id_start))
+	if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
 		return -EFAULT;
 	/*
 	 * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
 	 * in after unregistration can figure out that rseq needs to be
 	 * registered again.
 	 */
-	if (__put_user(cpu_id, &t->rseq->cpu_id))
+	if (put_user(cpu_id, &t->rseq->cpu_id))
 		return -EFAULT;
 	return 0;
 }
@@ -120,7 +120,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 	u32 sig;
 	int ret;
 
-	ret = __get_user(ptr, &t->rseq->rseq_cs);
+	ret = get_user(ptr, &t->rseq->rseq_cs);
 	if (ret)
 		return ret;
 	if (!ptr) {
@@ -163,7 +163,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
 	int ret;
 
 	/* Get thread flags. */
-	ret = __get_user(flags, &t->rseq->flags);
+	ret = get_user(flags, &t->rseq->flags);
 	if (ret)
 		return ret;
 
@@ -203,7 +203,7 @@ static int clear_rseq_cs(struct task_struct *t)
 	 *
 	 * Set rseq_cs to NULL with single-copy atomicity.
 	 */
-	return __put_user(0UL, &t->rseq->rseq_cs);
+	return put_user(0UL, &t->rseq->rseq_cs);
 }
 
 /*
-- 
cgit v1.2.3


From 0fb9a1abc8c97f858997e962694eb36b4517144e Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:52 -0400
Subject: rseq: uapi: Update uapi comments

Update rseq uapi header comments to reflect that user-space need to do
thread-local loads/stores from/to the struct rseq fields.

As a consequence of this added requirement, the kernel does not need
to perform loads/stores with single-copy atomicity.

Update the comment associated to the "flags" fields to describe
more accurately that it's only useful to facilitate single-stepping
through rseq critical sections with debuggers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/20180709195155.7654-4-mathieu.desnoyers@efficios.com
---
 kernel/rseq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2c8463acb50d..2a7748675be7 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -201,7 +201,7 @@ static int clear_rseq_cs(struct task_struct *t)
 	 * of code outside of the rseq assembly block. This performs
 	 * a lazy clear of the rseq_cs field.
 	 *
-	 * Set rseq_cs to NULL with single-copy atomicity.
+	 * Set rseq_cs to NULL.
 	 */
 	return put_user(0UL, &t->rseq->rseq_cs);
 }
-- 
cgit v1.2.3


From ec9c82e03a744e5698bd95eab872855861a821fa Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:53 -0400
Subject: rseq: uapi: Declare rseq_cs field as union, update includes

Declaring the rseq_cs field as a union between __u64 and two __u32
allows both 32-bit and 64-bit kernels to read the full __u64, and
therefore validate that a 32-bit user-space cleared the upper 32
bits, thus ensuring a consistent behavior between native 32-bit
kernels and 32-bit compat tasks on 64-bit kernels.

Check that the rseq_cs value read is < TASK_SIZE.

The asm/byteorder.h header needs to be included by rseq.h, now
that it is not using linux/types_32_64.h anymore.

Considering that only __32 and __u64 types are declared in linux/rseq.h,
the linux/types.h header should always be included for both kernel and
user-space code: including stdint.h is just for u64 and u32, which are
not used in this header at all.

Use copy_from_user()/clear_user() to interact with a 64-bit field,
because arm32 does not implement 64-bit __get_user, and ppc32 does not
64-bit get_user. Considering that the rseq_cs pointer does not need to
be loaded/stored with single-copy atomicity from the kernel anymore, we
can simply use copy_from_user()/clear_user().

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/20180709195155.7654-5-mathieu.desnoyers@efficios.com
---
 kernel/rseq.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2a7748675be7..c6242d8594dc 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -115,19 +115,20 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
 static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 {
 	struct rseq_cs __user *urseq_cs;
-	unsigned long ptr;
+	u64 ptr;
 	u32 __user *usig;
 	u32 sig;
 	int ret;
 
-	ret = get_user(ptr, &t->rseq->rseq_cs);
-	if (ret)
-		return ret;
+	if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+		return -EFAULT;
 	if (!ptr) {
 		memset(rseq_cs, 0, sizeof(*rseq_cs));
 		return 0;
 	}
-	urseq_cs = (struct rseq_cs __user *)ptr;
+	if (ptr >= TASK_SIZE)
+		return -EINVAL;
+	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
 	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
 		return -EFAULT;
 
@@ -203,7 +204,9 @@ static int clear_rseq_cs(struct task_struct *t)
 	 *
 	 * Set rseq_cs to NULL.
 	 */
-	return put_user(0UL, &t->rseq->rseq_cs);
+	if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+		return -EFAULT;
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From 0fc8c3581dd42bc8f530314ca86db2d861485731 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 9 Jul 2018 16:19:06 +0200
Subject: tracing/kprobe: Release kprobe print_fmt properly

We don't release tk->tp.call.print_fmt when destroying
local uprobe. Also there's missing print_fmt kfree in
create_local_trace_kprobe error path.

Link: http://lkml.kernel.org/r/20180709141906.2390-1-jolsa@kernel.org

Cc: stable@vger.kernel.org
Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU")
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index daa81571b22a..21f718472942 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1480,8 +1480,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
 	}
 
 	ret = __register_trace_kprobe(tk);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(tk->tp.call.print_fmt);
 		goto error;
+	}
 
 	return &tk->tp.call;
 error:
@@ -1501,6 +1503,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call)
 	}
 
 	__unregister_trace_kprobe(tk);
+
+	kfree(tk->tp.call.print_fmt);
 	free_trace_kprobe(tk);
 }
 #endif /* CONFIG_PERF_EVENTS */
-- 
cgit v1.2.3


From f8494fa3dd10b52eab47a9666a8bc34719a129aa Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 Jun 2018 17:08:22 -0700
Subject: tracing: Reorder display of TGID to be after PID

Currently ftrace displays data in trace output like so:

                                       _-----=> irqs-off
                                      / _----=> need-resched
                                     | / _---=> hardirq/softirq
                                     || / _--=> preempt-depth
                                     ||| /     delay
            TASK-PID   CPU    TGID   ||||    TIMESTAMP  FUNCTION
               | |       |      |    ||||       |         |
            bash-1091  [000] ( 1091) d..2    28.313544: sched_switch:

However Android's trace visualization tools expect a slightly different
format due to an out-of-tree patch patch that was been carried for a
decade, notice that the TGID and CPU fields are reversed:

                                       _-----=> irqs-off
                                      / _----=> need-resched
                                     | / _---=> hardirq/softirq
                                     || / _--=> preempt-depth
                                     ||| /     delay
            TASK-PID    TGID   CPU   ||||    TIMESTAMP  FUNCTION
               | |        |      |   ||||       |         |
            bash-1091  ( 1091) [002] d..2    64.965177: sched_switch:

From kernel v4.13 onwards, during which TGID was introduced, tracing
with systrace on all Android kernels will break (most Android kernels
have been on 4.9 with Android patches, so this issues hasn't been seen
yet). From v4.13 onwards things will break.

The chrome browser's tracing tools also embed the systrace viewer which
uses the legacy TGID format and updates to that are known to be
difficult to make.

Considering this, I suggest we make this change to the upstream kernel
and backport it to all Android kernels. I believe this feature is merged
recently enough into the upstream kernel that it shouldn't be a problem.
Also logically, IMO it makes more sense to group the TGID with the
TASK-PID and the CPU after these.

Link: http://lkml.kernel.org/r/20180626000822.113931-1-joel@joelfernandes.org

Cc: jreck@google.com
Cc: tkjos@google.com
Cc: stable@vger.kernel.org
Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output")
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 8 ++++----
 kernel/trace/trace_output.c | 5 +++--
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f054bd6a1c66..87cf25171fb8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3365,8 +3365,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
 
 	print_event_info(buf, m);
 
-	seq_printf(m, "#           TASK-PID   CPU#   %s  TIMESTAMP  FUNCTION\n", tgid ? "TGID     " : "");
-	seq_printf(m, "#              | |       |    %s     |         |\n",	 tgid ? "  |      " : "");
+	seq_printf(m, "#           TASK-PID   %s  CPU#   TIMESTAMP  FUNCTION\n", tgid ? "TGID     " : "");
+	seq_printf(m, "#              | |     %s    |       |         |\n",	 tgid ? "  |      " : "");
 }
 
 static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
@@ -3386,9 +3386,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
 		   tgid ? tgid_space : space);
 	seq_printf(m, "#                          %s||| /     delay\n",
 		   tgid ? tgid_space : space);
-	seq_printf(m, "#           TASK-PID   CPU#%s||||    TIMESTAMP  FUNCTION\n",
+	seq_printf(m, "#           TASK-PID %sCPU#  ||||    TIMESTAMP  FUNCTION\n",
 		   tgid ? "   TGID   " : space);
-	seq_printf(m, "#              | |       | %s||||       |         |\n",
+	seq_printf(m, "#              | |   %s  |   ||||       |         |\n",
 		   tgid ? "     |    " : space);
 }
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 90db994ac900..1c8e30fda46a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -594,8 +594,7 @@ int trace_print_context(struct trace_iterator *iter)
 
 	trace_find_cmdline(entry->pid, comm);
 
-	trace_seq_printf(s, "%16s-%-5d [%03d] ",
-			       comm, entry->pid, iter->cpu);
+	trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
 
 	if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
 		unsigned int tgid = trace_find_tgid(entry->pid);
@@ -606,6 +605,8 @@ int trace_print_context(struct trace_iterator *iter)
 			trace_seq_printf(s, "(%5d) ", tgid);
 	}
 
+	trace_seq_printf(s, "[%03d] ", iter->cpu);
+
 	if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
 		trace_print_lat_fmt(s, entry);
 
-- 
cgit v1.2.3