From 610467270fb368584b74567edd21c8cc5104490f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Jul 2017 07:17:02 -0400 Subject: cgroup: don't call migration methods if there are no tasks to migrate Subsystem migration methods shouldn't be called for empty migrations. cgroup_migrate_execute() implements this guarantee by bailing early if there are no source css_sets. This used to be correct before a79a908fd2b0 ("cgroup: introduce cgroup namespaces"), but no longer since the commit because css_sets can stay pinned without tasks in them. This caused cgroup_migrate_execute() call into cpuset migration methods with an empty cgroup_taskset. cpuset migration methods correctly assume that cgroup_taskset_first() never returns NULL; however, due to the bug, it can, leading to the following oops. Unable to handle kernel paging request for data at address 0x00000960 Faulting instruction address: 0xc0000000001d6868 Oops: Kernel access of bad area, sig: 11 [#1] ... CPU: 14 PID: 16947 Comm: kworker/14:0 Tainted: G W 4.12.0-rc4-next-20170609 #2 Workqueue: events cpuset_hotplug_workfn task: c00000000ca60580 task.stack: c00000000c728000 NIP: c0000000001d6868 LR: c0000000001d6858 CTR: c0000000001d6810 REGS: c00000000c72b720 TRAP: 0300 Tainted: GW (4.12.0-rc4-next-20170609) MSR: 8000000000009033 CR: 44722422 XER: 20000000 CFAR: c000000000008710 DAR: 0000000000000960 DSISR: 40000000 SOFTE: 1 GPR00: c0000000001d6858 c00000000c72b9a0 c000000001536e00 0000000000000000 GPR04: c00000000c72b9c0 0000000000000000 c00000000c72bad0 c000000766367678 GPR08: c000000766366d10 c00000000c72b958 c000000001736e00 0000000000000000 GPR12: c0000000001d6810 c00000000e749300 c000000000123ef8 c000000775af4180 GPR16: 0000000000000000 0000000000000000 c00000075480e9c0 c00000075480e9e0 GPR20: c00000075480e8c0 0000000000000001 0000000000000000 c00000000c72ba20 GPR24: c00000000c72baa0 c00000000c72bac0 c000000001407248 c00000000c72ba20 GPR28: c00000000141fc80 c00000000c72bac0 c00000000c6bc790 0000000000000000 NIP [c0000000001d6868] cpuset_can_attach+0x58/0x1b0 LR [c0000000001d6858] cpuset_can_attach+0x48/0x1b0 Call Trace: [c00000000c72b9a0] [c0000000001d6858] cpuset_can_attach+0x48/0x1b0 (unreliable) [c00000000c72ba00] [c0000000001cbe80] cgroup_migrate_execute+0xb0/0x450 [c00000000c72ba80] [c0000000001d3754] cgroup_transfer_tasks+0x1c4/0x360 [c00000000c72bba0] [c0000000001d923c] cpuset_hotplug_workfn+0x86c/0xa20 [c00000000c72bca0] [c00000000011aa44] process_one_work+0x1e4/0x580 [c00000000c72bd30] [c00000000011ae78] worker_thread+0x98/0x5c0 [c00000000c72bdc0] [c000000000124058] kthread+0x168/0x1b0 [c00000000c72be30] [c00000000000b2e8] ret_from_kernel_thread+0x5c/0x74 Instruction dump: f821ffa1 7c7d1b78 60000000 60000000 38810020 7fa3eb78 3f42ffed 4bff4c25 60000000 3b5a0448 3d420020 eb610020 7f43d378 e9290000 f92af200 ---[ end trace dcaaf98fb36d9e64 ]--- This patch fixes the bug by adding an explicit nr_tasks counter to cgroup_taskset and skipping calling the migration methods if the counter is zero. While at it, remove the now spurious check on no source css_sets. 
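In condensed form, the fix in the patch below amounts to counting queued tasks and gating the subsystem callbacks on that count (simplified sketch using names from the patch; error handling and the cancel path are omitted):

	/* cgroup_migrate_add_task(): each task queued for migration
	 * bumps the new counter.
	 */
	mgctx->tset.nr_tasks++;

	/* cgroup_migrate_execute(): ->can_attach()/->attach() run only
	 * when at least one task is actually migrating, so e.g.
	 * cpuset_can_attach() never sees an empty cgroup_taskset.
	 */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->can_attach) {
				tset->ssid = ssid;
				ret = ss->can_attach(tset);
			}
		} while_each_subsys_mask();
	}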
Signed-off-by: Tejun Heo Reported-and-tested-by: Abdul Haleem Cc: Roman Gushchin Cc: stable@vger.kernel.org # v4.6+ Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Link: http://lkml.kernel.org/r/1497266622.15415.39.camel@abdul.in.ibm.com --- kernel/cgroup/cgroup-internal.h | 3 +++ kernel/cgroup/cgroup.c | 58 ++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 793565c05742..8b4c3c2f2509 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -33,6 +33,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the number of tasks in the set */ + int nr_tasks; + /* the subsys currently being processed */ int ssid; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..cc53111072d8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2006,6 +2006,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (!cset->mg_src_cgrp) return; + mgctx->tset.nr_tasks++; + list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, @@ -2094,21 +2096,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset->src_csets)) - return 0; - /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->can_attach) { - tset->ssid = ssid; - ret = ss->can_attach(tset); - if (ret) { - failed_ssid = ssid; - goto out_cancel_attach; + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); + if (ret) { + failed_ssid = ssid; + goto out_cancel_attach; + } } - } - } while_each_subsys_mask(); + } while_each_subsys_mask(); + } /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2137,25 +2137,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->attach) { - tset->ssid = ssid; - ss->attach(tset); - } - } while_each_subsys_mask(); + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); + } + } while_each_subsys_mask(); + } ret = 0; goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ssid == failed_ssid) - break; - if (ss->cancel_attach) { - tset->ssid = ssid; - ss->cancel_attach(tset); - } - } while_each_subsys_mask(); + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ssid == failed_ssid) + break; + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); + } + } while_each_subsys_mask(); + } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); -- cgit v1.2.3 From 44ee454670122a959112caaa7aad86d8cacab1ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Jul 2017 10:50:14 -0400 Subject: semtimedop(): move compat to native ... 
and finally kill the sodding compat_convert_timespec() Signed-off-by: Al Viro --- kernel/compat.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 6f0a0e723a06..772e038d04d9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -200,29 +200,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int compat_convert_timespec(struct timespec __user **kts, - const void __user *cts) -{ - struct timespec ts; - struct timespec __user *uts; - - if (!cts || COMPAT_USE_64BIT_TIME) { - *kts = (struct timespec __user *)cts; - return 0; - } - - uts = compat_alloc_user_space(sizeof(ts)); - if (!uts) - return -EFAULT; - if (compat_get_timespec(&ts, cts)) - return -EFAULT; - if (copy_to_user(uts, &ts, sizeof(ts))) - return -EFAULT; - - *kts = uts; - return 0; -} - int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) { struct compat_itimerval v32; -- cgit v1.2.3 From 88e033e326307831922e34e7de0d694274dcc1c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 16 Jul 2017 21:40:30 -0400 Subject: cgroup: remove now unused list_head @pending in cgroup_apply_cftypes() Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..4f02b5edd82c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3230,7 +3230,6 @@ restart: static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { - LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; -- cgit v1.2.3 From 788b950c62e06b02278a0fd380e1a0667996ce3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 16 Jul 2017 21:43:33 -0400 Subject: cgroup: distinguish local and children populated states cgrp->populated_cnt counts both local (the cgroup's populated css_sets) and subtree proper (populated children) so that it's only zero when the whole subtree, including self, is empty. This patch splits the counter into two so that local and children populated states are tracked separately. It allows finer-grained tests on the state of the hierarchy which will be used to replace css_set walking local populated test. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4f02b5edd82c..5fe2644cd0f3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -587,39 +587,44 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - updated populated count of a cgroup + * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->populated_cnt accordingly. The - * count is propagated towards root so that a given cgroup's populated_cnt - * is zero iff the cgroup and all its descendants don't contain any tasks. + * task or losing the last. Update @cgrp->nr_populated_* accordingly. The + * count is propagated towards root so that a given cgroup's + * nr_populated_children is zero iff none of its descendants contain any + * tasks. 
* - * @cgrp's interface file "cgroup.populated" is zero if - * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt - * changes from or to zero, userland is notified that the content of the - * interface file has changed. This can be used to detect when @cgrp and - * its descendants become populated or empty. + * @cgrp's interface file "cgroup.populated" is zero if both + * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and + * 1 otherwise. When the sum changes from or to zero, userland is notified + * that the content of the interface file has changed. This can be used to + * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { + struct cgroup *child = NULL; + int adj = populated ? 1 : -1; + lockdep_assert_held(&css_set_lock); do { - bool trigger; + bool was_populated = cgroup_is_populated(cgrp); - if (populated) - trigger = !cgrp->populated_cnt++; + if (!child) + cgrp->nr_populated_csets += adj; else - trigger = !--cgrp->populated_cnt; + cgrp->nr_populated_children += adj; - if (!trigger) + if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); + child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -630,7 +635,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the - * ->populated_cnt of all associated cgroups accordingly. + * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { @@ -653,7 +658,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated) * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * - * This function automatically handles populated_cnt updates and + * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ -- cgit v1.2.3 From 27f26753f8c063c4cd2f06318e977ae136574b28 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 16 Jul 2017 21:44:18 -0400 Subject: cgroup: replace css_set walking populated test with testing cgrp->nr_populated_csets Implement trivial cgroup_has_tasks() which tests whether cgrp->nr_populated_csets is zero and replace the explicit local populated test in cgroup_subtree_control(). This simplifies the code and cgroup_has_tasks() will be used in more places later. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5fe2644cd0f3..d5b62313c753 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -325,6 +325,11 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } +static bool cgroup_has_tasks(struct cgroup *cgrp) +{ + return cgrp->nr_populated_csets; +} + /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { @@ -2971,28 +2976,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. 
*/ - if (enable && cgroup_parent(cgrp)) { - struct cgrp_cset_link *link; - - /* - * Because namespaces pin csets too, @cgrp->cset_links - * might not be empty even when @cgrp is empty. Walk and - * verify each cset. - */ - spin_lock_irq(&css_set_lock); - - ret = 0; - list_for_each_entry(link, &cgrp->cset_links, cset_link) { - if (css_set_populated(link->cset)) { - ret = -EBUSY; - break; - } - } - - spin_unlock_irq(&css_set_lock); - - if (ret) - goto out_unlock; + if (enable && cgroup_parent(cgrp) && cgroup_has_tasks(cgrp)) { + ret = -EBUSY; + goto out_unlock; } /* save and update control masks and prepare csses */ -- cgit v1.2.3 From 546ac1ffb70d25b56c1126940e5ec639c4dd7413 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:28:56 -0700 Subject: bpf: add devmap, a map for storing net device references Device map (devmap) is a BPF map, primarily useful for networking applications, that uses a key to lookup a reference to a netdevice. The map provides a clean way for BPF programs to build virtual port to physical port maps. Additionally, it provides a scoping function for the redirect action itself allowing multiple optimizations. Future patches will leverage the map to provide batching at the XDP layer. Another optimization/feature, that is not yet implemented, would be to support multiple netdevices per key to support efficient multicast and broadcast support. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 3 + kernel/bpf/devmap.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 8 ++ 3 files changed, 275 insertions(+) create mode 100644 kernel/bpf/devmap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1e5e658f2db..48e92705be59 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,6 +2,9 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +ifeq ($(CONFIG_NET),y) +obj-$(CONFIG_BPF_SYSCALL) += devmap.o +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c new file mode 100644 index 000000000000..1a878356bd37 --- /dev/null +++ b/kernel/bpf/devmap.c @@ -0,0 +1,264 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* Devmaps primary use is as a backend map for XDP BPF helper call + * bpf_redirect_map(). Because XDP is mostly concerned with performance we + * spent some effort to ensure the datapath with redirect maps does not use + * any locking. This is a quick note on the details. + * + * We have three possible paths to get into the devmap control plane bpf + * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall + * will invoke an update, delete, or lookup operation. To ensure updates and + * deletes appear atomic from the datapath side xchg() is used to modify the + * netdev_map array. 
Then because the datapath does a lookup into the netdev_map + * array (read-only) from an RCU critical section we use call_rcu() to wait for + * an rcu grace period before free'ing the old data structures. This ensures the + * datapath always has a valid copy. However, the datapath does a "flush" + * operation that pushes any pending packets in the driver outside the RCU + * critical section. Each bpf_dtab_netdev tracks these pending operations using + * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed + * until all bits are cleared indicating outstanding flush operations have + * completed. + * + * BPF syscalls may race with BPF program calls on any of the update, delete + * or lookup operations. As noted above the xchg() operation also keep the + * netdev_map consistent in this case. From the devmap side BPF programs + * calling into these operations are the same as multiple user space threads + * making system calls. + */ +#include +#include +#include +#include +#include "percpu_freelist.h" +#include "bpf_lru_list.h" +#include "map_in_map.h" + +struct bpf_dtab_netdev { + struct net_device *dev; + int key; + struct rcu_head rcu; + struct bpf_dtab *dtab; +}; + +struct bpf_dtab { + struct bpf_map map; + struct bpf_dtab_netdev **netdev_map; +}; + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + u64 cost; + int err; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags) + return ERR_PTR(-EINVAL); + + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + if (attr->value_size > KMALLOC_MAX_SIZE) + return ERR_PTR(-E2BIG); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + dtab->map.map_type = attr->map_type; + dtab->map.key_size = attr->key_size; + dtab->map.value_size = attr->value_size; + dtab->map.max_entries = attr->max_entries; + dtab->map.map_flags = attr->map_flags; + + err = -ENOMEM; + + /* make sure page count doesn't overflow */ + cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_dtab; + + dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(dtab->map.pages); + if (err) + goto free_dtab; + + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *)); + if (!dtab->netdev_map) + goto free_dtab; + + return &dtab->map; + +free_dtab: + kfree(dtab); + return ERR_PTR(err); +} + +static void dev_map_free(struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further reads against netdev_map. It does __not__ ensure pending + * flush operations (if any) are complete. 
+ */ + synchronize_rcu(); + + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + dev_put(dev->dev); + kfree(dev); + } + + /* At this point bpf program is detached and all pending operations + * _must_ be complete + */ + bpf_map_area_free(dtab->netdev_map); + kfree(dtab); +} + +static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= dtab->map.max_entries) { + *next = 0; + return 0; + } + + if (index == dtab->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or + * update happens in parallel here a dev_put wont happen until after reading the + * ifindex. + */ +static void *dev_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + u32 i = *(u32 *)key; + + if (i >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[i]); + return dev ? &dev->dev->ifindex : NULL; +} + +static void __dev_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_dtab_netdev *old_dev; + + old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_put(old_dev->dev); + kfree(old_dev); +} + +static int dev_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + /* Use synchronize_rcu() here to ensure any rcu critical sections + * have completed, but this does not guarantee a flush has happened + * yet. Because driver side rcu_read_lock/unlock only protects the + * running XDP program. However, for pending flush operations the + * dev and ctx are stored in another per cpu map. And additionally, + * the driver tear down ensures all soft irqs are complete before + * removing the net device in the case of dev_put equals zero. + */ + old_dev = xchg(&dtab->netdev_map[k], NULL); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + return 0; +} + +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct net *net = current->nsproxy->net_ns; + struct bpf_dtab_netdev *dev, *old_dev; + u32 i = *(u32 *)key; + u32 ifindex = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= dtab->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + if (!ifindex) { + dev = NULL; + } else { + dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN); + if (!dev) + return -ENOMEM; + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + kfree(dev); + return -EINVAL; + } + + dev->key = i; + dev->dtab = dtab; + } + + /* Use call_rcu() here to ensure rcu critical sections have completed + * Remembering the driver side flush operation will happen before the + * net device is removed. 
+ */ + old_dev = xchg(&dtab->netdev_map[i], dev); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +const struct bpf_map_ops dev_map_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_get_next_key, + .map_lookup_elem = dev_map_lookup_elem, + .map_update_elem = dev_map_update_elem, + .map_delete_elem = dev_map_delete_elem, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..4016774d5cca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1276,6 +1276,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + /* devmap returns a pointer to a live net_device ifindex that we cannot + * allow to be modified from bpf side. So do not allow lookup elements + * for now. + */ + case BPF_MAP_TYPE_DEVMAP: + if (func_id == BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) -- cgit v1.2.3 From 97f91a7cf04ff605845c20948b8a80e54cbd3376 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:18 -0700 Subject: bpf: add bpf_redirect_map helper routine BPF programs can use the devmap with a bpf_redirect_map() helper routine to forward packets to netdevice in map. Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 12 ++++++++++++ kernel/bpf/verifier.c | 4 ++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1a878356bd37..36dc13deb2e1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -159,6 +159,18 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + + if (key >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[key]); + return dev ? dev->dev : NULL; +} + /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or * update happens in parallel here a dev_put wont happen until after reading the * ifindex. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4016774d5cca..df05d65f0c87 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1312,6 +1312,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; + case BPF_FUNC_redirect_map: + if (map->map_type != BPF_MAP_TYPE_DEVMAP) + goto error; + break; default: break; } -- cgit v1.2.3 From 11393cc9b9be2a1f61559e6fb9c27bc8fa20b1ff Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:40 -0700 Subject: xdp: Add batching support to redirect map For performance reasons we want to avoid updating the tail pointer in the driver tx ring as much as possible. To accomplish this we add batching support to the redirect path in XDP. This adds another ndo op "xdp_flush" that is used to inform the driver that it should bump the tail pointer on the TX ring. Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/devmap.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 36dc13deb2e1..b2ef04a1c86a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,6 +53,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; + unsigned long int __percpu *flush_needed; }; static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); + cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; @@ -97,6 +99,14 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + /* A per cpu bitfield with a bit per possible net device */ + dtab->flush_needed = __alloc_percpu( + BITS_TO_LONGS(attr->max_entries) * + sizeof(unsigned long), + __alignof__(unsigned long)); + if (!dtab->flush_needed) + goto free_dtab; + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *)); if (!dtab->netdev_map) @@ -105,6 +115,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) return &dtab->map; free_dtab: + free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -112,7 +123,7 @@ free_dtab: static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i; + int i, cpu; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map) */ synchronize_rcu(); + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. + */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + + while (!bitmap_empty(bitmap, dtab->map.max_entries)) + cpu_relax(); + } + for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ + free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +void __dev_map_insert_ctx(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); + + __set_bit(key, bitmap); +} + struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -171,6 +203,39 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return dev ? dev->dev : NULL; } +/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled + * from the driver before returning from its napi->poll() routine. The poll() + * routine is called either from busy_poll context or net_rx_action signaled + * from NET_RX_SOFTIRQ. 
Either way the poll routine must complete before the + * net device can be torn down. On devmap tear down we ensure the ctx bitmap + * is zeroed before completing to ensure all flush operations have completed. + */ +void __dev_map_flush(struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); + u32 bit; + + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); + struct net_device *netdev; + + /* This is possible if the dev entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!dev)) + continue; + + netdev = dev->dev; + + __clear_bit(bit, bitmap); + if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush)) + continue; + + netdev->netdev_ops->ndo_xdp_flush(netdev); + } +} + /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or * update happens in parallel here a dev_put wont happen until after reading the * ifindex. @@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->dev->ifindex : NULL; } +static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev) +{ + if (old_dev->dev->netdev_ops->ndo_xdp_flush) { + struct net_device *fl = old_dev->dev; + unsigned long *bitmap; + int cpu; + + for_each_online_cpu(cpu) { + bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu); + __clear_bit(old_dev->key, bitmap); + + fl->netdev_ops->ndo_xdp_flush(old_dev->dev); + } + } +} + static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *old_dev; old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_map_flush_old(old_dev); dev_put(old_dev->dev); kfree(old_dev); } -- cgit v1.2.3 From 2ddf71e23cc246e95af72a6deed67b4a50a7b81c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:30:02 -0700 Subject: net: add notifier hooks for devmap bpf map The BPF map devmap holds a refcnt on the net_device structure when it is in the map. We need to do this to ensure on driver unload we don't lose a dev reference. However, its not very convenient to have to manually unload the map when destroying a net device so add notifier handlers to do the cleanup automatically. But this creates a race between update/destroy BPF syscall and programs and the unregister netdev hook. Unfortunately, the best I could come up with is either to live with requiring manual removal of net devices from the map before removing the net device OR to add a mutex in devmap to ensure the map is not modified while we are removing a device. The fallout also requires that BPF programs no longer update/delete the map from the BPF program side because the mutex may sleep and this can not be done from inside an rcu critical section. This is not a real problem though because I have not come up with any use cases where this is actually useful in practice. If/when we come up with a compelling user for this we may need to revisit this. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 +- 2 files changed, 74 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b2ef04a1c86a..899364d097f5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -34,6 +34,17 @@ * netdev_map consistent in this case. 
From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. + * + * Finally, any of the above may race with a netdev_unregister notifier. The + * unregister notifier must search for net devices in the map structure that + * contain a reference to the net device and remove them. This is a two step + * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) + * check to see if the ifindex is the same as the net_device being removed. + * Unfortunately, the xchg() operations do not protect against this. To avoid + * potentially removing incorrect objects the dev_map_list_mutex protects + * conflicting netdev unregister and BPF syscall operations. Updates and + * deletes from a BPF program (done in rcu critical section) are blocked + * because of this mutex. */ #include #include @@ -54,8 +65,12 @@ struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; unsigned long int __percpu *flush_needed; + struct list_head list; }; +static DEFINE_MUTEX(dev_map_list_mutex); +static LIST_HEAD(dev_map_list); + static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; @@ -112,6 +127,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_dtab; + mutex_lock(&dev_map_list_mutex); + list_add_tail(&dtab->list, &dev_map_list); + mutex_unlock(&dev_map_list_mutex); return &dtab->map; free_dtab: @@ -146,6 +164,11 @@ static void dev_map_free(struct bpf_map *map) cpu_relax(); } + /* Although we should no longer have datapath or bpf syscall operations + * at this point we we can still race with netdev notifier, hence the + * lock. + */ + mutex_lock(&dev_map_list_mutex); for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -160,6 +183,8 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ + list_del(&dtab->list); + mutex_unlock(&dev_map_list_mutex); free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); @@ -296,9 +321,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) * the driver tear down ensures all soft irqs are complete before * removing the net device in the case of dev_put equals zero. */ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, * Remembering the driver side flush operation will happen before the * net device is removed. 
*/ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[i], dev); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -356,3 +385,47 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, }; + +static int dev_map_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dtab *dtab; + int i; + + switch (event) { + case NETDEV_UNREGISTER: + mutex_lock(&dev_map_list_mutex); + list_for_each_entry(dtab, &dev_map_list, list) { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev || + dev->dev->ifindex != netdev->ifindex) + continue; + dev = xchg(&dtab->netdev_map[i], NULL); + if (dev) + call_rcu(&dev->rcu, + __dev_map_entry_free); + } + } + mutex_unlock(&dev_map_list_mutex); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block dev_map_notifier = { + .notifier_call = dev_map_notification, +}; + +static int __init dev_map_init(void) +{ + register_netdevice_notifier(&dev_map_notifier); + return 0; +} + +subsys_initcall(dev_map_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df05d65f0c87..ebe9b38ff522 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1281,7 +1281,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * for now. */ case BPF_MAP_TYPE_DEVMAP: - if (func_id == BPF_FUNC_map_lookup_elem) + if (func_id != BPF_FUNC_redirect_map) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: -- cgit v1.2.3 From 8f716c9b5febf6ed0f5fedb7c9407cd0c25b2796 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:16 -0500 Subject: x86/mm: Add support to access boot related data in the clear MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boot data (such as EFI related data) is not encrypted when the system is booted because UEFI/BIOS does not run with SME active. In order to access this data properly it needs to be mapped decrypted. Update early_memremap() to provide an arch specific routine to modify the pagetable protection attributes before they are applied to the new mapping. This is used to remove the encryption mask for boot related data. Update memremap() to provide an arch specific routine to determine if RAM remapping is allowed. RAM remapping will cause an encrypted mapping to be generated. By preventing RAM remapping, ioremap_cache() will be used instead, which will provide a decrypted mapping of the boot related data. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/81fb6b4117a5df6b9f2eda342f81bbef4b23d2e5.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- kernel/memremap.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 124bed776532..9afdc434fb49 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size) } #endif -static void *try_ram_remap(resource_size_t offset, size_t size) +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) { unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) return __va(offset); + return NULL; /* fallback to arch_memremap_wb */ } @@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem @@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * the requested range is potentially in System RAM. */ if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size); + addr = try_ram_remap(offset, size, flags); if (!addr) addr = arch_memremap_wb(offset, size); } -- cgit v1.2.3 From bba4ed011a52d494aa7ef5e08cf226709bbf3f60 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:28 -0500 Subject: x86/mm, kexec: Allow kexec to be used with SME MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide support so that kexec can be used to boot a kernel when SME is enabled. Support is needed to allocate pages for kexec without encryption. This is needed in order to be able to reboot in the kernel in the same manner as originally booted. Additionally, when shutting down all of the CPUs we need to be sure to flush the caches and then halt. This is needed when booting from a state where SME was not active into a state where SME is active (or vice-versa). Without these steps, it is possible for cache lines to exist for the same physical location but tagged both with and without the encryption bit. This can cause random memory corruption when caches are flushed depending on which cacheline is written last. 
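In condensed form, the allocation-side change in the patch below strips __GFP_ZERO before allocating and clears the pages only after arch_kexec_post_alloc_pages() has switched them to their final (unencrypted) mapping, so zeroing happens through the mapping kexec will actually use (simplified sketch of kimage_alloc_pages(), not the full function):

	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
	if (pages) {
		unsigned int count = 1 << order, i;

		/* make the pages unencrypted first ... */
		arch_kexec_post_alloc_pages(page_address(pages), count,
					    gfp_mask);

		/* ... and only then honour __GFP_ZERO */
		if (gfp_mask & __GFP_ZERO)
			for (i = 0; i < count; i++)
				clear_highpage(pages + i);
	}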
Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b95ff075db3e7cd545313f2fb609a49619a09625.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- kernel/kexec_core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1ae7c41c33c1..20fef1a38602 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; - pages = alloc_pages(gfp_mask, order); + pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; @@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); + + arch_kexec_post_alloc_pages(page_address(pages), count, + gfp_mask); + + if (gfp_mask & __GFP_ZERO) + for (i = 0; i < count; i++) + clear_highpage(pages + i); } return pages; @@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page) order = page_private(page); count = 1 << order; + + arch_kexec_pre_free_pages(page_address(page), count); + for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); -- cgit v1.2.3 From 7af608e4f9530372aec6e940552bf76595f2e265 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 17:57:46 -0400 Subject: cgroup: create dfl_root files on subsys registration On subsystem registration, css_populate_dir() is not called on the new root css, so the interface files for the subsystem on cgrp_dfl_root aren't created on registration. This is a residue from the days when cgrp_dfl_root was used only as the parking spot for unused subsystems, which no longer is true as it's used as the root for cgroup2. This is often fine as later operations tend to create them as a part of mount (cgroup1) or subtree_control operations (cgroup2); however, it's not difficult to mount cgroup2 with the controller interface files missing as Waiman found out. Fix it by invoking css_populate_dir() on the root css on subsys registration. 
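In condensed form, the fix in the patch below has cgroup_init() populate the root css's directory for each registered subsystem, with cgroup_mutex taken around the call (only the added lines are shown; their placement is paraphrased from the patch):

	/* in cgroup_init(), after each subsystem's ->bind() callback */
	mutex_lock(&cgroup_mutex);
	css_populate_dir(init_css_set.subsys[ssid]);
	mutex_unlock(&cgroup_mutex);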
Signed-off-by: Tejun Heo Reported-and-tested-by: Waiman Long Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cc53111072d8..744975947d01 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4673,6 +4673,10 @@ int __init cgroup_init(void) if (ss->bind) ss->bind(init_css_set.subsys[ssid]); + + mutex_lock(&cgroup_mutex); + css_populate_dir(init_css_set.subsys[ssid]); + mutex_unlock(&cgroup_mutex); } /* init_css_set.subsys[] has been updated, re-hash */ -- cgit v1.2.3 From 5c0338c68706be53b3dc472e4308961c36e4ece1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 18:41:52 -0400 Subject: workqueue: restore WQ_UNBOUND/max_active==1 to be ordered The combination of WQ_UNBOUND and max_active == 1 used to imply ordered execution. After NUMA affinity 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues"), this is no longer true due to per-node worker pools. While the right way to create an ordered workqueue is alloc_ordered_workqueue(), the documentation has been misleading for a long time and people do use WQ_UNBOUND and max_active == 1 for ordered workqueues which can lead to subtle bugs which are very difficult to trigger. It's unlikely that we'd see noticeable performance impact by enforcing ordering on WQ_UNBOUND / max_active == 1 workqueues. Let's automatically set __WQ_ORDERED for those workqueues. Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Reported-by: Alexei Potashnik Fixes: 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues") Cc: stable@vger.kernel.org # v3.10+ --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a86688fabc55..abe4a4971c24 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3929,6 +3929,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* + * Unbound && max_active == 1 used to imply ordered, which is no + * longer the case on NUMA machines due to per-node pools. While + * alloc_ordered_workqueue() is the right way to create an ordered + * workqueue, keep the previous behavior to avoid subtle breakages + * on NUMA. + */ + if ((flags & WQ_UNBOUND) && max_active == 1) + flags |= __WQ_ORDERED; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; -- cgit v1.2.3 From a2b426267c56773201f968fdb5eda6ab9ae94e34 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 29 Apr 2017 14:12:15 -0500 Subject: userns,pidns: Verify the userns for new pid namespaces It is pointless and confusing to allow a pid namespace hierarchy and the user namespace hierarchy to get out of sync. The owner of a child pid namespace should be the owner of the parent pid namespace or a descendant of the owner of the parent pid namespace. Otherwise it is possible to construct scenarios where a process has a capability over a parent pid namespace but does not have the capability over a child pid namespace. Which confusingly makes permission checks non-transitive. It requires use of setns into a pid namespace (but not into a user namespace) to create such a scenario. Add the function in_userns to help in making this determination. 
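The level-based ancestry check introduced by in_userns() in the patch below can be illustrated with a self-contained toy version (the struct and function names here are made up for the example; the real helper walks struct user_namespace):

	#include <stdbool.h>

	/* toy stand-in for struct user_namespace */
	struct ns {
		int level;		/* depth from the root namespace, root == 0 */
		const struct ns *parent;
	};

	/* true if @child is @ancestor itself or one of its descendants */
	static bool ns_is_in(const struct ns *ancestor, const struct ns *child)
	{
		const struct ns *n = child;

		/*
		 * An ancestor at depth d is reached from @child by walking up
		 * exactly child->level - d steps, so climb until the levels
		 * match and then compare pointers.
		 */
		while (n->level > ancestor->level)
			n = n->parent;

		return n == ancestor;
	}

Compared with walking all the way to the root as the old current_in_userns() did, the level comparison bounds the walk at the ancestor's depth instead of looping until a NULL parent.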
v2: Optimized in_userns by using level as suggested by: Kirill Tkhai Ref: 49f4d8b93ccf ("pidns: Capture the user namespace and filter ns_last_pid") Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 4 ++++ kernel/user_namespace.c | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel')
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 74a5a7255b4d..4918314893bc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns int i; int err; + err = -EINVAL; + if (!in_userns(parent_pid_ns->user_ns, user_ns)) + goto out; + err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2f735cbe05e8..c490f1e4313b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns) } /* - * Returns true if @ns is the same namespace as or a descendant of - * @target_ns. + * Returns true if @child is the same namespace or a descendant of + * @ancestor. */ +bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child) +{ + const struct user_namespace *ns; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return (ns == ancestor); +} + bool current_in_userns(const struct user_namespace *target_ns) { - struct user_namespace *ns; - for (ns = current_user_ns(); ns; ns = ns->parent) { - if (ns == target_ns) - return true; - } - return false; + return in_userns(target_ns, current_user_ns()); } static inline struct user_namespace *to_user_ns(struct ns_common *ns) -- cgit v1.2.3
From 4d28df6152aa3ffd0ad0389bb1d31f5b1c1c2b1f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 May 2017 17:33:36 +0300
Subject: prctl: Allow local CAP_SYS_ADMIN changing exe_file
During checkpointing and restore of userspace tasks we ran into the situation that it is not possible to restore tasks whose user namespace does not have uid 0 or gid 0 mapped. People create user namespace mappings however they want, and there is no requirement that any particular uid or gid must be mapped. So, if there is no uid 0 or gid 0 in the mapping, it's impossible to restore mm->exe_file of the processes belonging to that user namespace. There is also no workaround. It's impossible to create a temporary uid/gid mapping, because only one write to /proc/[pid]/uid_map and gid_map is allowed during a namespace lifetime. If there is an entry, no further mappings can be written. If there isn't an entry, we can't write one either, because then the user task would not be able to do so in the future. The patch changes the check and looks for CAP_SYS_ADMIN instead of uid 0 and gid 0. This allows a task to be restored independently of its user namespace mappings.
Signed-off-by: Kirill Tkhai CC: Andrew Morton CC: Serge Hallyn CC: "Eric W. Biederman" CC: Oleg Nesterov CC: Michal Hocko CC: Andrei Vagin CC: Cyrill Gorcunov CC: Stanislav Kinsburskiy CC: Pavel Tikhomirov Reviewed-by: Cyrill Gorcunov Signed-off-by: Eric W.
Biederman --- kernel/sys.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2855ee73acd0..9aebc2935013 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1896,15 +1896,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) /* * Finally, make sure the caller has the rights to - * change /proc/pid/exe link: only local root should + * change /proc/pid/exe link: only local sys admin should * be allowed to. */ if (prctl_map->exe_fd != (u32)-1) { - struct user_namespace *ns = current_user_ns(); - const struct cred *cred = current_cred(); - - if (!uid_eq(cred->uid, make_kuid(ns, 0)) || - !gid_eq(cred->gid, make_kgid(ns, 0))) + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) goto out; } -- cgit v1.2.3 From 0c96b27305faf06c068b45e07d28336c80dac286 Mon Sep 17 00:00:00 2001 From: Ethan Barnes Date: Wed, 19 Jul 2017 22:36:00 +0000 Subject: smp/hotplug: Handle removal correctly in cpuhp_store_callbacks() If cpuhp_store_callbacks() is called for CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN, which are the indicators for dynamically allocated states, then cpuhp_store_callbacks() allocates a new dynamic state. The first allocation in each range returns CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN. If cpuhp_remove_state() is invoked for one of these states, then there is no protection against the allocation mechanism. So the removal, which should clear the callbacks and the name, gets a new state assigned and clears that one. As a consequence the state which should be cleared stays initialized. A consecutive CPU hotplug operation dereferences the state callbacks and accesses either freed or reused memory, resulting in crashes. Add a protection against this by checking the name argument for NULL. If it's NULL it's a removal. If not, it's an allocation. [ tglx: Added a comment and massaged changelog ] Fixes: 5b7aa87e0482 ("cpu/hotplug: Implement setup/removal interface") Signed-off-by: Ethan Barnes Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: "Srivatsa S. Bhat" Cc: Sebastian Siewior Cc: Paul McKenney Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/DM2PR04MB398242FC7776D603D9F99C894A60@DM2PR04MB398.namprd04.prod.outlook.com --- kernel/cpu.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..a88c29ab09be 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1252,7 +1252,17 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, struct cpuhp_step *sp; int ret = 0; - if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) { + /* + * If name is NULL, then the state gets removed. + * + * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on + * the first allocation from these dynamic ranges, so the removal + * would trigger a new allocation and clear the wrong (already + * empty) state, leaving the callbacks of the to be cleared state + * dangling, which causes wreckage on the next hotplug operation. + */ + if (name && (state == CPUHP_AP_ONLINE_DYN || + state == CPUHP_BP_PREPARE_DYN)) { ret = cpuhp_reserve_state(state); if (ret < 0) return ret; -- cgit v1.2.3 From 715c809d9a9e38d8fb9476757ddaf64c1a9f767f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:00 -0400 Subject: cgroup: reorganize cgroup.procs / task write path Currently, writes "cgroup.procs" and "cgroup.tasks" files are all handled by __cgroup_procs_write() on both v1 and v2. 
This patch reoragnizes the write path so that there are common helper functions that different write paths use. While this somewhat increases LOC, the different paths are no longer intertwined and each path has more flexibility to implement different behaviors which will be necessary for the planned v2 thread support. v3: - Restructured so that cgroup_procs_write_permission() takes @src_cgrp and @dst_cgrp. v2: - Rolled in Waiman's task reference count fix. - Updated on top of nsdelegate changes. Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/cgroup-internal.h | 8 +- kernel/cgroup/cgroup-v1.c | 58 ++++++++++-- kernel/cgroup/cgroup.c | 192 ++++++++++++++++++++-------------------- 3 files changed, 152 insertions(+), 106 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 793565c05742..0e81c6109e91 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -180,10 +180,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup); -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, - loff_t off); +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) + __acquires(&cgroup_threadgroup_rwsem); +void cgroup_procs_write_finish(struct task_struct *task) + __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7bf4b1533f34..60f72475863e 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -510,10 +510,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return 0; } -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + bool threadgroup) { - return __cgroup_procs_write(of, buf, nbytes, off, false); + struct cgroup *cgrp; + struct task_struct *task; + const struct cred *cred, *tcred; + ssize_t ret; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, threadgroup); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. 
+ */ + cred = current_cred(); + tcred = get_task_cred(task); + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + put_cred(tcred); + if (ret) + goto out_finish; + + ret = cgroup_attach_task(cgrp, task, threadgroup); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup1_procs_write(of, buf, nbytes, off, true); +} + +static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, @@ -592,7 +640,7 @@ struct cftype cgroup1_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, + .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", @@ -611,7 +659,7 @@ struct cftype cgroup1_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, + .write = cgroup1_tasks_write, }, { .name = "notify_on_release", diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d5b62313c753..e3bda0752501 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2421,96 +2421,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -static int cgroup_procs_write_permission(struct task_struct *task, - struct cgroup *dst_cgrp, - struct kernfs_open_file *of) -{ - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; - struct cgroup *src_cgrp, *com_cgrp; - struct inode *inode; - int ret; - - if (!cgroup_on_dfl(dst_cgrp)) { - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); - - /* - * even if we're attaching all tasks in the thread group, - * we only need to check permissions on one of them. - */ - if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->euid, tcred->suid)) - ret = 0; - else - ret = -EACCES; - - put_cred(tcred); - return ret; - } - - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - /* and the common ancestor */ - com_cgrp = src_cgrp; - while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) - com_cgrp = cgroup_parent(com_cgrp); - - /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; - - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - if (ret) - return ret; - - /* - * If namespaces are delegation boundaries, %current must be able - * to see both source and destination cgroups from its namespace. - */ - if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && - (!cgroup_is_descendant(src_cgrp, root_cgrp) || - !cgroup_is_descendant(dst_cgrp, root_cgrp))) - return -ENOENT; - - return 0; -} - -/* - * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup. 
- */ -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) + __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; - struct cgroup_subsys *ss; - struct cgroup *cgrp; pid_t pid; - int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; + return ERR_PTR(-EINVAL); percpu_down_write(&cgroup_threadgroup_rwsem); + rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { - ret = -ESRCH; - goto out_unlock_rcu; + tsk = ERR_PTR(-ESRCH); + goto out_unlock_threadgroup; } } else { tsk = current; @@ -2526,35 +2453,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { - ret = -EINVAL; - goto out_unlock_rcu; + tsk = ERR_PTR(-EINVAL); + goto out_unlock_threadgroup; } get_task_struct(tsk); + goto out_unlock_rcu; + +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); +out_unlock_rcu: rcu_read_unlock(); + return tsk; +} - ret = cgroup_procs_write_permission(tsk, cgrp, of); - if (!ret) - ret = cgroup_attach_task(cgrp, tsk, threadgroup); +void cgroup_procs_write_finish(struct task_struct *task) + __releases(&cgroup_threadgroup_rwsem) +{ + struct cgroup_subsys *ss; + int ssid; - put_task_struct(tsk); - goto out_unlock_threadgroup; + /* release reference from cgroup_procs_write_start() */ + put_task_struct(task); -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); - cgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, - loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, true); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3870,6 +3795,79 @@ static int cgroup_procs_show(struct seq_file *s, void *v) return 0; } +static int cgroup_procs_write_permission(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb) +{ + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *com_cgrp = src_cgrp; + struct inode *inode; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* find the common ancestor */ + while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) + com_cgrp = cgroup_parent(com_cgrp); + + /* %current should be authorized to migrate to the common ancestor */ + inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + if (ret) + return ret; + + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. 
+ */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || + !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) + return -ENOENT; + + return 0; +} + +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *src_cgrp, *dst_cgrp; + struct task_struct *task; + ssize_t ret; + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, true); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb); + if (ret) + goto out_finish; + + ret = cgroup_attach_task(dst_cgrp, task, true); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { -- cgit v1.2.3 From bc2fb7ed089ffd16d26e1d95b898a37d2b37d201 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:01 -0400 Subject: cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS css_task_iter currently always walks all tasks. With the scheduled cgroup v2 thread support, the iterator would need to handle multiple types of iteration. As a preparation, add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS. If the flag is not specified, it walks all tasks as before. When asserted, the iterator only walks the group leaders. For now, the only user of the flag is cgroup v2 "cgroup.procs" file which no longer needs to skip non-leader tasks in cgroup_procs_next(). Note that cgroup v1 "cgroup.procs" can't use the group leader walk as v1 "cgroup.procs" doesn't mean "list all thread group leaders in the cgroup" but "list all thread group id's with any threads in the cgroup". While at it, update cgroup_procs_show() to use task_pid_vnr() instead of task_tgid_vnr(). As the iteration guarantees that the function only sees group leaders, this doesn't change the output and will allow sharing the function for thread iteration. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 +++--- kernel/cgroup/cgroup.c | 24 ++++++++++++++---------- kernel/cgroup/cpuset.c | 6 +++--- kernel/cgroup/freezer.c | 6 +++--- 4 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 60f72475863e..167aaab04bf9 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -121,7 +121,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. 
*/ do { - css_task_iter_start(&from->self, &it); + css_task_iter_start(&from->self, 0, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -373,7 +373,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -749,7 +749,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e3bda0752501..3c5a37a9a892 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3643,6 +3643,7 @@ static void css_task_iter_advance(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); WARN_ON_ONCE(!l); +repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the @@ -3657,11 +3658,18 @@ static void css_task_iter_advance(struct css_task_iter *it) css_task_iter_advance_css_set(it); else it->task_pos = l; + + /* if PROCS, skip over tasks which aren't group leaders */ + if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && + !thread_group_leader(list_entry(it->task_pos, struct task_struct, + cg_list))) + goto repeat; } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of + * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call @@ -3669,7 +3677,7 @@ static void css_task_iter_advance(struct css_task_iter *it) * returns NULL. On completion of iteration, css_task_iter_end() must be * called. 
*/ -void css_task_iter_start(struct cgroup_subsys_state *css, +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ @@ -3680,6 +3688,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, spin_lock_irq(&css_set_lock); it->ss = css->ss; + it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; @@ -3753,13 +3762,8 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; - struct task_struct *task; - - do { - task = css_task_iter_next(it); - } while (task && !thread_group_leader(task)); - return task; + return css_task_iter_next(it); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) @@ -3780,10 +3784,10 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); } return cgroup_procs_next(s, NULL, NULL); @@ -3791,7 +3795,7 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) static int cgroup_procs_show(struct seq_file *s, void *v) { - seq_printf(s, "%d\n", task_tgid_vnr(v)); + seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..252d70c9a49b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -861,7 +861,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); @@ -1091,7 +1091,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1284,7 +1284,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 1b72d56edce5..08236798d173 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) rcu_read_unlock(); /* are all tasks frozen? 
*/ - css_task_iter_start(css, &it); + css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task)) { @@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); @@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); -- cgit v1.2.3 From 454000adaa2a7420df6e56a42f22726d05872a3f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:02 -0400 Subject: cgroup: introduce cgroup->dom_cgrp and threaded css_set handling cgroup v2 is in the process of growing thread granularity support. A threaded subtree is composed of a thread root and threaded cgroups which are proper members of the subtree. The root cgroup of the subtree serves as the domain cgroup to which the processes (as opposed to threads / tasks) of the subtree conceptually belong and domain-level resource consumptions not tied to any specific task are charged. Inside the subtree, threads won't be subject to process granularity or no-internal-task constraint and can be distributed arbitrarily across the subtree. This patch introduces cgroup->dom_cgrp along with threaded css_set handling. * cgroup->dom_cgrp points to self for normal and thread roots. For proper thread subtree members, points to the dom_cgrp (the thread root). * css_set->dom_cset points to self if for normal and thread roots. If threaded, points to the css_set which belongs to the cgrp->dom_cgrp. The dom_cgrp serves as the resource domain and keeps the matching csses available. The dom_cset holds those csses and makes them easily accessible. * All threaded csets are linked on their dom_csets to enable iteration of all threaded tasks. * cgroup->nr_threaded_children keeps track of the number of threaded children. This patch adds the above but doesn't actually use them yet. The following patches will build on top. v4: ->nr_threaded_children added. v3: ->proc_cgrp/cset renamed to ->dom_cgrp/cset. Updated for the new enable-threaded-per-cgroup behavior. v2: Added cgroup_is_threaded() helper. 
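As a rough illustration of the relationships described above (not part of the patch; the helper name is hypothetical and only restates what the new dom_cset/dfl_cgrp fields mean), resolving the domain cgroup of a css_set could look like:

  /* Illustrative sketch only, using the fields this series introduces. */
  static struct cgroup *example_dom_cgrp_of(struct css_set *cset)
  {
          /* dom_cset is only stable under css_set_lock */
          lockdep_assert_held(&css_set_lock);

          /*
           * A normal or thread-root css_set has dom_cset pointing back at
           * itself, so this returns its own default-hierarchy cgroup.  A
           * threaded css_set instead returns the cgroup of its threaded
           * subtree's domain (the thread root).
           */
          return cset->dom_cset->dfl_cgrp;
  }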
Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3c5a37a9a892..c7e1c243b77d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -330,6 +330,11 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } +static bool cgroup_is_threaded(struct cgroup *cgrp) +{ + return cgrp->dom_cgrp != cgrp; +} + /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { @@ -565,9 +570,11 @@ EXPORT_SYMBOL_GPL(of_css); */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), + .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), @@ -575,6 +582,11 @@ struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +static bool css_set_threaded(struct css_set *cset) +{ + return cset->dom_cset != cset; +} + /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set @@ -618,10 +630,14 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) do { bool was_populated = cgroup_is_populated(cgrp); - if (!child) + if (!child) { cgrp->nr_populated_csets += adj; - else - cgrp->nr_populated_children += adj; + } else { + if (cgroup_is_threaded(child)) + cgrp->nr_populated_threaded_children += adj; + else + cgrp->nr_populated_domain_children += adj; + } if (was_populated == cgroup_is_populated(cgrp)) break; @@ -747,6 +763,8 @@ void put_css_set_locked(struct css_set *cset) if (!refcount_dec_and_test(&cset->refcount)) return; + WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); + /* This css_set is dead. unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); @@ -763,6 +781,11 @@ void put_css_set_locked(struct css_set *cset) kfree(link); } + if (css_set_threaded(cset)) { + list_del(&cset->threaded_csets_node); + put_css_set_locked(cset->dom_cset); + } + kfree_rcu(cset, rcu_head); } @@ -781,6 +804,7 @@ static bool compare_css_sets(struct css_set *cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { + struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* @@ -791,6 +815,16 @@ static bool compare_css_sets(struct css_set *cset, if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; + + /* @cset's domain should match the default cgroup's */ + if (cgroup_on_dfl(new_cgrp)) + new_dfl_cgrp = new_cgrp; + else + new_dfl_cgrp = old_cset->dfl_cgrp; + + if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) + return false; + /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. 
As different cgroups may @@ -998,9 +1032,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, } refcount_set(&cset->refcount, 1); + cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); + INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_preload_node); @@ -1038,6 +1074,28 @@ static struct css_set *find_css_set(struct css_set *old_cset, spin_unlock_irq(&css_set_lock); + /* + * If @cset should be threaded, look up the matching dom_cset and + * link them up. We first fully initialize @cset then look for the + * dom_cset. It's simpler this way and safe as @cset is guaranteed + * to stay empty until we return. + */ + if (cgroup_is_threaded(cset->dfl_cgrp)) { + struct css_set *dcset; + + dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); + if (!dcset) { + put_css_set(cset); + return NULL; + } + + spin_lock_irq(&css_set_lock); + cset->dom_cset = dcset; + list_add_tail(&cset->threaded_csets_node, + &dcset->threaded_csets); + spin_unlock_irq(&css_set_lock); + } + return cset; } @@ -1680,6 +1738,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; + cgrp->dom_cgrp = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -4408,6 +4467,7 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -4452,6 +4512,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); + if (parent && cgroup_is_threaded(cgrp)) + parent->nr_threaded_children--; + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ -- cgit v1.2.3 From 450ee0c1feed657894e0b4bdd48f3974af9d394c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:03 -0400 Subject: cgroup: implement CSS_TASK_ITER_THREADED cgroup v2 is in the process of growing thread granularity support. Once thread mode is enabled, the root cgroup of the subtree serves as the dom_cgrp to which the processes of the subtree conceptually belong and domain-level resource consumptions not tied to any specific task are charged. In the subtree, threads won't be subject to process granularity or no-internal-task constraint and can be distributed arbitrarily across the subtree. This patch implements a new task iterator flag CSS_TASK_ITER_THREADED, which, when used on a dom_cgrp, makes the iteration include the tasks on all the associated threaded css_sets. "cgroup.procs" read path is updated to use it so that reading the file on a proc_cgrp lists all processes. This will also be used by controller implementations which need to walk processes or tasks at the resource domain level. Task iteration is implemented nested in css_set iteration. If CSS_TASK_ITER_THREADED is specified, after walking tasks of each !threaded css_set, all the associated threaded css_sets are visited before moving onto the next !threaded css_set. v2: ->cur_pcset renamed to ->cur_dcset. Updated for the new enable-threaded-per-cgroup behavior. 
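A hedged usage sketch (the function below is hypothetical and not part of the patch): after this change, a caller that wants to visit every process of a resource domain, including processes attached to the threaded css_sets below it, combines the two iterator flags:

  static void example_walk_domain_processes(struct cgroup_subsys_state *css)
  {
          struct css_task_iter it;
          struct task_struct *task;

          /*
           * CSS_TASK_ITER_PROCS restricts the walk to thread group
           * leaders; CSS_TASK_ITER_THREADED additionally pulls in the
           * threaded css_sets linked to each domain css_set.
           */
          css_task_iter_start(css, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
          while ((task = css_task_iter_next(&it)))
                  pr_info("visiting pid %d\n", task_pid_nr(task));
          css_task_iter_end(&it);
  }

This mirrors the flag combination the "cgroup.procs" read path adopts in the diff that follows.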
Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c7e1c243b77d..a1d59af274a9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3629,6 +3629,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css) return ret; } +static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) +{ + struct list_head *l; + struct cgrp_cset_link *link; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* find the next threaded cset */ + if (it->tcset_pos) { + l = it->tcset_pos->next; + + if (l != it->tcset_head) { + it->tcset_pos = l; + return container_of(l, struct css_set, + threaded_csets_node); + } + + it->tcset_pos = NULL; + } + + /* find the next cset */ + l = it->cset_pos; + l = l->next; + if (l == it->cset_head) { + it->cset_pos = NULL; + return NULL; + } + + if (it->ss) { + cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + + it->cset_pos = l; + + /* initialize threaded css_set walking */ + if (it->flags & CSS_TASK_ITER_THREADED) { + if (it->cur_dcset) + put_css_set_locked(it->cur_dcset); + it->cur_dcset = cset; + get_css_set(cset); + + it->tcset_head = &cset->threaded_csets; + it->tcset_pos = &cset->threaded_csets; + } + + return cset; +} + /** * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance @@ -3637,32 +3689,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css) */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { - struct list_head *l = it->cset_pos; - struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { - l = l->next; - if (l == it->cset_head) { - it->cset_pos = NULL; + cset = css_task_iter_next_css_set(it); + if (!cset) { it->task_pos = NULL; return; } - - if (it->ss) { - cset = container_of(l, struct css_set, - e_cset_node[it->ss->id]); - } else { - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } } while (!css_set_populated(cset)); - it->cset_pos = l; - if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; else @@ -3805,6 +3844,9 @@ void css_task_iter_end(struct css_task_iter *it) spin_unlock_irq(&css_set_lock); } + if (it->cur_dcset) + put_css_set(it->cur_dcset); + if (it->cur_task) put_task_struct(it->cur_task); } @@ -3830,6 +3872,7 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct css_task_iter *it = of->priv; + unsigned iter_flags = CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED; /* * When a seq_file is seeked, it's always traversed sequentially @@ -3843,10 +3886,10 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } return cgroup_procs_next(s, NULL, NULL); -- cgit v1.2.3 From 8cfd8147df67e741d93b8783a3ea8f3c74f93a0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 21 Jul 
2017 11:14:51 -0400 Subject: cgroup: implement cgroup v2 thread support This patch implements cgroup v2 thread support. The goal of the thread mode is supporting hierarchical accounting and control at thread granularity while staying inside the resource domain model which allows coordination across different resource controllers and handling of anonymous resource consumptions. A cgroup is always created as a domain and can be made threaded by writing to the "cgroup.type" file. When a cgroup becomes threaded, it becomes a member of a threaded subtree which is anchored at the closest ancestor which isn't threaded. The threads of the processes which are in a threaded subtree can be placed anywhere without being restricted by process granularity or no-internal-process constraint. Note that the threads aren't allowed to escape to a different threaded subtree. To be used inside a threaded subtree, a controller should explicitly support threaded mode and be able to handle internal competition in the way which is appropriate for the resource. The root of a threaded subtree, the nearest ancestor which isn't threaded, is called the threaded domain and serves as the resource domain for the whole subtree. This is the last cgroup where domain controllers are operational and where all the domain-level resource consumptions in the subtree are accounted. This allows threaded controllers to operate at thread granularity when requested while staying inside the scope of system-level resource distribution. As the root cgroup is exempt from the no-internal-process constraint, it can serve as both a threaded domain and a parent to normal cgroups, so, unlike non-root cgroups, the root cgroup can have both domain and threaded children. Internally, in a threaded subtree, each css_set has its ->dom_cset pointing to a matching css_set which belongs to the threaded domain. This ensures that thread root level cgroup_subsys_state for all threaded controllers are readily accessible for domain-level operations. This patch enables threaded mode for the pids and perf_events controllers. Neither has to worry about domain-level resource consumptions and it's enough to simply set the flag. For more details on the interface and behavior of the thread mode, please refer to the section 2-2-2 in Documentation/cgroup-v2.txt added by this patch. v5: - Dropped silly no-op ->dom_cgrp init from cgroup_create(). Spotted by Waiman. - Documentation updated as suggested by Waiman. - cgroup.type content slightly reformatted. - Mark the debug controller threaded. v4: - Updated to the general idea of marking specific cgroups domain/threaded as suggested by PeterZ. v3: - Dropped "join" and always make mixed children join the parent's threaded subtree. v2: - After discussions with Waiman, support for mixed thread mode is added. This should address the issue that Peter pointed out where any nesting should be avoided for thread subtrees while coexisting with other domain cgroups. - Enabling / disabling thread mode now piggy backs on the existing control mask update mechanism. - Bug fixes and cleanup. 
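A rough user-space sketch of the interface described above (hypothetical program, cgroup path and thread placement; not part of the patch): switching an empty cgroup into threaded mode and then placing the calling thread there comes down to two writes:

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Assumed cgroup2 mount point and a pre-created, empty child cgroup. */
  #define CGRP "/sys/fs/cgroup/mygrp"

  static int write_str(const char *path, const char *val)
  {
          int fd = open(path, O_WRONLY);

          if (fd < 0)
                  return -1;
          if (write(fd, val, strlen(val)) != (ssize_t)strlen(val)) {
                  close(fd);
                  return -1;
          }
          return close(fd);
  }

  int main(void)
  {
          char tid[32];

          /* switch the (empty) cgroup from domain to threaded mode */
          if (write_str(CGRP "/cgroup.type", "threaded"))
                  perror("cgroup.type");

          /* threads can then be placed individually via cgroup.threads,
             provided they stay within the same threaded domain */
          snprintf(tid, sizeof(tid), "%ld", (long)syscall(SYS_gettid));
          if (write_str(CGRP "/cgroup.threads", tid))
                  perror("cgroup.threads");

          return 0;
  }

Whether the second write succeeds is governed by the delegation and same-domain rules spelled out in the documentation added by this patch.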
Signed-off-by: Tejun Heo Cc: Waiman Long Cc: Peter Zijlstra --- kernel/cgroup/cgroup-internal.h | 2 +- kernel/cgroup/cgroup-v1.c | 5 +- kernel/cgroup/cgroup.c | 355 +++++++++++++++++++++++++++++++++++++--- kernel/cgroup/debug.c | 1 + kernel/cgroup/pids.c | 1 + kernel/events/core.c | 1 + 6 files changed, 340 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 0e81c6109e91..f10eb19ddf04 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -170,7 +170,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, struct cgroup_namespace *ns); -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 167aaab04bf9..f0e8601b13cb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (cgroup_on_dfl(to)) return -EINVAL; - if (!cgroup_may_migrate_to(to)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(to); + if (ret) + return ret; mutex_lock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a1d59af274a9..c396e701c206 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; +/* some controllers can be threaded on the default hierarchy */ +static u16 cgrp_dfl_threaded_ss_mask; + /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; @@ -335,14 +338,93 @@ static bool cgroup_is_threaded(struct cgroup *cgrp) return cgrp->dom_cgrp != cgrp; } +/* can @cgrp host both domain and threaded children? */ +static bool cgroup_is_mixable(struct cgroup *cgrp) +{ + /* + * Root isn't under domain level resource control exempting it from + * the no-internal-process constraint, so it can serve as a thread + * root and a parent of resource domains at the same time. + */ + return !cgroup_parent(cgrp); +} + +/* can @cgrp become a thread root? should always be true for a thread root */ +static bool cgroup_can_be_thread_root(struct cgroup *cgrp) +{ + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return true; + + /* domain roots can't be nested under threaded */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* can only have either domain or threaded children */ + if (cgrp->nr_populated_domain_children) + return false; + + /* and no domain controllers can be enabled */ + if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) + return false; + + return true; +} + +/* is @cgrp root of a threaded subtree? */ +static bool cgroup_is_thread_root(struct cgroup *cgrp) +{ + /* thread root should be a domain */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* a domain w/ threaded children is a thread root */ + if (cgrp->nr_threaded_children) + return true; + + /* + * A domain which has tasks and explicit threaded controllers + * enabled is a thread root. 
+ */ + if (cgroup_has_tasks(cgrp) && + (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) + return true; + + return false; +} + +/* a domain which isn't connected to the root w/o brekage can't be used */ +static bool cgroup_is_valid_domain(struct cgroup *cgrp) +{ + /* the cgroup itself can be a thread root */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* but the ancestors can't be unless mixable */ + while ((cgrp = cgroup_parent(cgrp))) { + if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) + return false; + if (cgroup_is_threaded(cgrp)) + return false; + } + + return true; +} + /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; - if (parent) - return parent->subtree_control; + if (parent) { + u16 ss_mask = parent->subtree_control; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; + } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | @@ -355,8 +437,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - if (parent) - return parent->subtree_ss_mask; + if (parent) { + u16 ss_mask = parent->subtree_ss_mask; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; + } return cgrp->root->subsys_mask; } @@ -2237,17 +2325,40 @@ out_release_tset: } /** - * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * - * On the default hierarchy, except for the root, subtree_control must be - * zero for migration destination cgroups with tasks so that child cgroups - * don't compete against tasks. + * On the default hierarchy, except for the mixable, (possible) thread root + * and threaded cgroups, subtree_control must be zero for migration + * destination cgroups with tasks so that child cgroups don't compete + * against tasks. */ -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { - return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || - !dst_cgrp->subtree_control; + /* v1 doesn't have any restriction */ + if (!cgroup_on_dfl(dst_cgrp)) + return 0; + + /* verify @dst_cgrp can host resources */ + if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(dst_cgrp)) + return 0; + + /* + * If @dst_cgrp is already or can become a thread root or is + * threaded, it doesn't matter. 
+ */ + if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) + return 0; + + /* apply no-internal-process constraint */ + if (dst_cgrp->subtree_control) + return -EBUSY; + + return 0; } /** @@ -2452,8 +2563,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, struct task_struct *task; int ret; - if (!cgroup_may_migrate_to(dst_cgrp)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -2881,6 +2993,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +{ + u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + + /* if nothing is getting enabled, nothing to worry about */ + if (!enable) + return 0; + + /* can @cgrp host any resources? */ + if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return 0; + + if (domain_enable) { + /* can't enable domain controllers inside a thread subtree */ + if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return -EOPNOTSUPP; + } else { + /* + * Threaded controllers can handle internal competitions + * and are always allowed inside a (prospective) thread + * subtree. + */ + if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return 0; + } + + /* + * Controllers can't be enabled for a cgroup with tasks to avoid + * child cgroups competing against tasks. + */ + if (cgroup_has_tasks(cgrp)) + return -EBUSY; + + return 0; +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -2956,14 +3108,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - /* - * Except for the root, subtree_control must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (enable && cgroup_parent(cgrp) && cgroup_has_tasks(cgrp)) { - ret = -EBUSY; + ret = cgroup_vet_subtree_control_enable(cgrp, enable); + if (ret) goto out_unlock; - } /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -2982,6 +3129,84 @@ out_unlock: return ret ?: nbytes; } +static int cgroup_enable_threaded(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup *dom_cgrp = parent->dom_cgrp; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* noop if already threaded */ + if (cgroup_is_threaded(cgrp)) + return 0; + + /* we're joining the parent's domain, ensure its validity */ + if (!cgroup_is_valid_domain(dom_cgrp) || + !cgroup_can_be_thread_root(dom_cgrp)) + return -EOPNOTSUPP; + + /* + * Allow enabling thread mode only on empty cgroups to avoid + * implicit migrations and recursive operations. + */ + if (cgroup_has_tasks(cgrp) || css_has_online_children(&cgrp->self)) + return -EBUSY; + + /* + * The following shouldn't cause actual migrations and should + * always succeed. 
+ */ + cgroup_save_control(cgrp); + + cgrp->dom_cgrp = dom_cgrp; + ret = cgroup_apply_control(cgrp); + if (!ret) + parent->nr_threaded_children++; + else + cgrp->dom_cgrp = cgrp; + + cgroup_finalize_control(cgrp, ret); + return ret; +} + +static int cgroup_type_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + if (cgroup_is_threaded(cgrp)) + seq_puts(seq, "threaded\n"); + else if (!cgroup_is_valid_domain(cgrp)) + seq_puts(seq, "domain invalid\n"); + else if (cgroup_is_thread_root(cgrp)) + seq_puts(seq, "domain threaded\n"); + else + seq_puts(seq, "domain\n"); + + return 0; +} + +static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int ret; + + /* only switching to threaded mode is supported */ + if (strcmp(strstrip(buf), "threaded")) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* threaded can only be enabled */ + ret = cgroup_enable_threaded(cgrp); + + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", @@ -3867,12 +4092,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) return css_task_iter_next(it); } -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, + unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct css_task_iter *it = of->priv; - unsigned iter_flags = CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED; /* * When a seq_file is seeked, it's always traversed sequentially @@ -3895,6 +4120,23 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) return cgroup_procs_next(s, NULL, NULL); } +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +{ + struct cgroup *cgrp = seq_css(s)->cgroup; + + /* + * All processes of a threaded subtree belong to the domain cgroup + * of the subtree. Only threads can be distributed across the + * subtree. Reject reads on cgroup.procs in the subtree proper. + * They're always empty anyway. 
+ */ + if (cgroup_is_threaded(cgrp)) + return ERR_PTR(-EOPNOTSUPP); + + return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | + CSS_TASK_ITER_THREADED); +} + static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); @@ -3974,8 +4216,63 @@ out_unlock: return ret ?: nbytes; } +static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) +{ + return __cgroup_procs_start(s, pos, 0); +} + +static ssize_t cgroup_threads_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *src_cgrp, *dst_cgrp; + struct task_struct *task; + ssize_t ret; + + buf = strstrip(buf); + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, false); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* thread migrations follow the cgroup.procs delegation rule */ + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb); + if (ret) + goto out_finish; + + /* and must be contained in the same domain */ + ret = -EOPNOTSUPP; + if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) + goto out_finish; + + ret = cgroup_attach_task(dst_cgrp, task, false); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { + { + .name = "cgroup.type", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_type_show, + .write = cgroup_type_write, + }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, @@ -3986,6 +4283,14 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, + { + .name = "cgroup.threads", + .release = cgroup_procs_release, + .seq_start = cgroup_threads_start, + .seq_next = cgroup_procs_next, + .seq_show = cgroup_procs_show, + .write = cgroup_threads_write, + }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, @@ -4753,11 +5058,17 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; + /* implicit controllers must be threaded too */ + WARN_ON(ss->implicit_on_dfl && !ss->threaded); + if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; + if (ss->threaded) + cgrp_dfl_threaded_ss_mask |= 1 << ss->id; + if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index dac46af22782..787a242fa69d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -352,6 +352,7 @@ static int __init enable_cgroup_debug(char *str) { debug_cgrp_subsys.dfl_cftypes = debug_files; debug_cgrp_subsys.implicit_on_dfl = true; + debug_cgrp_subsys.threaded = true; return 1; } __setup("cgroup_debug", enable_cgroup_debug); diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 2237201d66d5..9829c67ebc0a 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = { .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, + .threaded = true, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1538df9b2b65..ec78247da310 100644 --- a/kernel/events/core.c 
+++ b/kernel/events/core.c @@ -11210,5 +11210,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = { * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From 7a0cf0e74ab6cfd8e561f5f12860d4ff8844905a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 21 Jul 2017 11:14:51 -0400 Subject: cgroup: update debug controller to print out thread mode information Update debug controller so that it prints out debug info about thread mode. 1) The relationship between proc_cset and threaded_csets are displayed. 2) The status of being a thread root or threaded cgroup is displayed. This patch is extracted from Waiman's larger patch. v2: - Removed [thread root] / [threaded] from debug.cgroup_css_links file as the same information is available from cgroup.type. Suggested by Waiman. - Threaded marking is moved to the previous patch. Patch-originally-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 2 ++ kernel/cgroup/cgroup.c | 4 ++-- kernel/cgroup/debug.c | 52 ++++++++++++++++++++++++++++++----------- 3 files changed, 42 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index f10eb19ddf04..c167a40278e6 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -153,6 +153,8 @@ static inline void get_css_set(struct css_set *cset) bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); +bool cgroup_is_thread_root(struct cgroup *cgrp); +bool cgroup_is_threaded(struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c396e701c206..e9a377dc5bdb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -333,7 +333,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } -static bool cgroup_is_threaded(struct cgroup *cgrp) +bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } @@ -372,7 +372,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) } /* is @cgrp root of a threaded subtree? */ -static bool cgroup_is_thread_root(struct cgroup *cgrp) +bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 787a242fa69d..f661b4cc5efd 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -114,27 +114,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) { struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - int dead_cnt = 0, extra_refs = 0; + int dead_cnt = 0, extra_refs = 0, threaded_csets = 0; spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; int refcnt = refcount_read(&cset->refcount); - seq_printf(seq, " %d", refcnt); - if (refcnt - cset->nr_tasks > 0) { - int extra = refcnt - cset->nr_tasks; - - seq_printf(seq, " +%d", extra); - /* - * Take out the one additional reference in - * init_css_set. - */ - if (cset == &init_css_set) - extra--; - extra_refs += extra; + /* + * Print out the proc_cset and threaded_cset relationship + * and highlight difference between refcount and task_count. 
+ */ + seq_printf(seq, "css_set %pK", cset); + if (rcu_dereference_protected(cset->dom_cset, 1) != cset) { + threaded_csets++; + seq_printf(seq, "=>%pK", cset->dom_cset); + } + if (!list_empty(&cset->threaded_csets)) { + struct css_set *tcset; + int idx = 0; + + list_for_each_entry(tcset, &cset->threaded_csets, + threaded_csets_node) { + seq_puts(seq, idx ? "," : "<="); + seq_printf(seq, "%pK", tcset); + idx++; + } + } else { + seq_printf(seq, " %d", refcnt); + if (refcnt - cset->nr_tasks > 0) { + int extra = refcnt - cset->nr_tasks; + + seq_printf(seq, " +%d", extra); + /* + * Take out the one additional reference in + * init_css_set. + */ + if (cset == &init_css_set) + extra--; + extra_refs += extra; + } } seq_puts(seq, "\n"); @@ -163,10 +185,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) } spin_unlock_irq(&css_set_lock); - if (!dead_cnt && !extra_refs) + if (!dead_cnt && !extra_refs && !threaded_csets) return 0; seq_puts(seq, "\n"); + if (threaded_csets) + seq_printf(seq, "threaded css_sets = %d\n", threaded_csets); if (extra_refs) seq_printf(seq, "extra references = %d\n", extra_refs); if (dead_cnt) -- cgit v1.2.3 From db3e50f3234ba1a477413f56a9e5800a73dca786 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 21 Jul 2017 14:39:31 +0300 Subject: device property: Get rid of struct fwnode_handle type field Instead of relying on the struct fwnode_handle type field, define fwnode_operations structs for all separate types of fwnodes. To find out the type, compare to the ops field to relevant ops structs. This change has two benefits: 1. it avoids adding the type field to each and every instance of struct fwnode_handle, thus saving memory and 2. makes the ops field the single factor that defines both the types of the fwnode as well as defines the implementation of its operations, decreasing the possibility of bugs when developing code dealing with fwnode internals. Suggested-by: Rob Herring Signed-off-by: Sakari Ailus Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. 
Wysocki --- kernel/irq/irqdomain.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f1f251479aa6..e064fd1390f1 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -41,6 +41,8 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif +const struct fwnode_operations irqchip_fwnode_ops; + /** * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain @@ -86,7 +88,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; fwid->data = data; - fwid->fwnode.type = FWNODE_IRQCHIP; + fwid->fwnode.ops = &irqchip_fwnode_ops; return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); @@ -193,10 +195,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, } if (!domain->name) { - if (fwnode) { - pr_err("Invalid fwnode type (%d) for irqdomain\n", - fwnode->type); - } + if (fwnode) + pr_err("Invalid fwnode type for irqdomain\n"); domain->name = kasprintf(GFP_KERNEL, "unknown-%d", atomic_inc_return(&unknown_domains)); if (!domain->name) { -- cgit v1.2.3 From aa7519af450d3c62a057aece24877c34562fa25a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 19 Jul 2017 15:42:42 +0530 Subject: cpufreq: Use transition_delay_us for legacy governors as well The policy->transition_delay_us field is used only by the schedutil governor currently, and this field describes how fast the driver wants the cpufreq governor to change CPUs frequency. It should rather be a common thing across all governors, as it doesn't have any schedutil dependency here. Create a new helper cpufreq_policy_transition_delay_us() to get the transition delay across all governors. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 29a397067ffa..89c4dd9777bb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -528,16 +528,7 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - if (policy->transition_delay_us) { - tunables->rate_limit_us = policy->transition_delay_us; - } else { - unsigned int lat; - - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; - } + tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); policy->governor_data = sg_policy; sg_policy->tunables = tunables; -- cgit v1.2.3 From bd8c9ba3b1e2037af5af4e48aea1087212838179 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 17 Jul 2017 17:19:25 -0700 Subject: PM / suspend: Export pm_suspend_target_state Have the core suspend/resume framework store the system-wide suspend state (suspend_state_t) we are about to enter, and expose it to drivers via pm_suspend_target_state in order to retrieve that. The state is assigned in suspend_devices_and_enter(). This is useful for platform specific drivers that may need to take a slightly different suspend/resume path based on the system's suspend/resume state being entered. Signed-off-by: Florian Fainelli Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3ecf275d7e44..d0c0b96c2383 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -47,6 +47,8 @@ const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; +suspend_state_t pm_suspend_target_state; +EXPORT_SYMBOL_GPL(pm_suspend_target_state); unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -456,6 +458,8 @@ int suspend_devices_and_enter(suspend_state_t state) if (!sleep_state_supported(state)) return -ENOSYS; + pm_suspend_target_state = state; + error = platform_suspend_begin(state); if (error) goto Close; @@ -485,6 +489,7 @@ int suspend_devices_and_enter(suspend_state_t state) Close: platform_resume_end(state); + pm_suspend_target_state = PM_SUSPEND_ON; return error; Recover_platform: -- cgit v1.2.3 From 8d8b2441db9647890251538f60b75a4e45fdef8d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 19 Jul 2017 02:38:44 +0200 Subject: PM / sleep: Do not print debug messages by default Debug messages from the system suspend/hibernation infrastructure can fill up the entire kernel log buffer in some cases and anyway they are only useful for debugging. They depend on CONFIG_PM_DEBUG, but that is set as a rule as some generally useful diagnostic facilities depend on it too. For this reason, avoid printing those messages by default, but make it possible to turn them on as needed with the help of a new sysfs attribute under /sys/power/. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 24 +++++++++++----------- kernel/power/main.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/power/suspend.c | 10 +++++----- 3 files changed, 69 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e1914c7b85b1..e19ee179d211 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -651,7 +651,7 @@ static int load_image_and_restore(void) int error; unsigned int flags; - pr_debug("Loading hibernation image.\n"); + pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); @@ -681,7 +681,7 @@ int hibernate(void) bool snapshot_test = false; if (!hibernation_available()) { - pr_debug("Hibernation not available.\n"); + pm_pr_dbg("Hibernation not available.\n"); return -EPERM; } @@ -727,7 +727,7 @@ int hibernate(void) else flags |= SF_CRC32_MODE; - pr_debug("Writing image.\n"); + pm_pr_dbg("Writing image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { @@ -739,7 +739,7 @@ int hibernate(void) in_suspend = 0; pm_restore_gfp_mask(); } else { - pr_debug("Image restored successfully.\n"); + pm_pr_dbg("Image restored successfully.\n"); } Free_bitmaps: @@ -747,7 +747,7 @@ int hibernate(void) Thaw: unlock_device_hotplug(); if (snapshot_test) { - pr_debug("Checking hibernation image\n"); + pm_pr_dbg("Checking hibernation image\n"); error = swsusp_check(); if (!error) error = load_image_and_restore(); @@ -811,7 +811,7 @@ static int software_resume(void) goto Unlock; } - pr_debug("Checking hibernation image partition %s\n", resume_file); + pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); if (resume_delay) { pr_info("Waiting %dsec before reading resume device ...\n", @@ -853,10 +853,10 @@ static int software_resume(void) } 
Check_image: - pr_debug("Hibernation image partition %d:%d present\n", + pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("Looking for hibernation image.\n"); + pm_pr_dbg("Looking for hibernation image.\n"); error = swsusp_check(); if (error) goto Unlock; @@ -875,7 +875,7 @@ static int software_resume(void) goto Close_Finish; } - pr_debug("Preparing processes for restore.\n"); + pm_pr_dbg("Preparing processes for restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; @@ -888,7 +888,7 @@ static int software_resume(void) /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); - pr_debug("Hibernation image not present or could not be loaded.\n"); + pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: swsusp_close(FMODE_READ); @@ -1012,8 +1012,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, error = -EINVAL; if (!error) - pr_debug("Hibernation mode set to '%s'\n", - hibernation_modes[mode]); + pm_pr_dbg("Hibernation mode set to '%s'\n", + hibernation_modes[mode]); unlock_system_sleep(); return error ? error : n; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 42bd800a6755..5ce00902c7e3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -361,6 +361,57 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, power_attr_ro(pm_wakeup_irq); +static bool pm_debug_messages_on __read_mostly; + +static ssize_t pm_debug_messages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_debug_messages_on); +} + +static ssize_t pm_debug_messages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_debug_messages_on = !!val; + return n; +} + +power_attr(pm_debug_messages); + +/** + * pm_pr_dbg - Print a suspend debug message to the kernel log. + * @fmt: Message format. + * + * The message will be emitted if enabled through the pm_debug_messages + * sysfs attribute. + */ +void pm_pr_dbg(const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + if (!pm_debug_messages_on) + return; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_DEBUG "PM: %pV", &vaf); + + va_end(args); +} + #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ @@ -697,6 +748,7 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, + &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d0c0b96c2383..4f10773322fa 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -106,7 +106,7 @@ static void freeze_enter(void) static void s2idle_loop(void) { - pr_debug("PM: suspend-to-idle\n"); + pm_pr_dbg("suspend-to-idle\n"); do { freeze_enter(); @@ -124,7 +124,7 @@ static void s2idle_loop(void) pm_wakeup_clear(false); } while (!dpm_suspend_noirq(PMSG_SUSPEND)); - pr_debug("PM: resume from suspend-to-idle\n"); + pm_pr_dbg("resume from suspend-to-idle\n"); } void freeze_wake(void) @@ -547,7 +547,7 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif - pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); + pm_pr_dbg("Preparing system for sleep (%s)\n", pm_states[state]); pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) @@ -557,13 +557,13 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Suspending system (%s)\n", pm_states[state]); + pm_pr_dbg("Suspending system (%s)\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); Finish: - pr_debug("PM: Finishing wakeup.\n"); + pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: mutex_unlock(&pm_mutex); -- cgit v1.2.3 From 8915aa2042f85ecf86d4a202ef6d04bf06f05cca Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 20 Jul 2017 03:38:07 +0200 Subject: PM / sleep: Mark suspend/hibernation start and finish Regardless of whether or not debug messages from the core system suspend/hibernation code are enabled, it is useful to know when system-wide transitions start and finish (or fail), so print "info" messages at these points. Signed-off-by: Rafael J. 
Wysocki Acked-by: Mark Salyzyn --- kernel/power/hibernate.c | 5 +++++ kernel/power/suspend.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e19ee179d211..a5c36e9c56a6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -692,6 +692,7 @@ int hibernate(void) goto Unlock; } + pr_info("hibernation entry\n"); pm_prepare_console(); error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); if (error) { @@ -762,6 +763,8 @@ int hibernate(void) atomic_inc(&snapshot_device_available); Unlock: unlock_system_sleep(); + pr_info("hibernation exit\n"); + return error; } @@ -868,6 +871,7 @@ static int software_resume(void) goto Unlock; } + pr_info("resume from hibernation\n"); pm_prepare_console(); error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); if (error) { @@ -884,6 +888,7 @@ static int software_resume(void) Finish: __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); + pr_info("resume from hibernation failed (%d)\n", error); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ Unlock: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4f10773322fa..18cdb1596d27 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -584,6 +584,7 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; + pr_info("PM: suspend entry (%s)\n", pm_states[state]); error = enter_state(state); if (error) { suspend_stats.fail++; @@ -591,6 +592,7 @@ int pm_suspend(suspend_state_t state) } else { suspend_stats.success++; } + pr_info("PM: suspend exit\n"); return error; } EXPORT_SYMBOL(pm_suspend); -- cgit v1.2.3 From cb08e0353c249a27aed10c6f60a13871ae449d33 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 23 Jul 2017 00:03:43 +0200 Subject: PM / timekeeping: Print debug messages when requested The messages printed by tk_debug_account_sleep_time() are basically useful for system sleep debugging, so print them only when the other debug messages from the core suspend/hibernate code are enabled. While at it, make it clear that the messages from tk_debug_account_sleep_time() are about timekeeping suspend duration, because in general timekeeping may be suspeded and resumed for multiple times during one system suspend-resume cycle. Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 10 +++++++--- kernel/time/timekeeping_debug.c | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 5ce00902c7e3..b7876eaf83f3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -388,13 +388,14 @@ static ssize_t pm_debug_messages_store(struct kobject *kobj, power_attr(pm_debug_messages); /** - * pm_pr_dbg - Print a suspend debug message to the kernel log. + * __pm_pr_dbg - Print a suspend debug message to the kernel log. + * @defer: Whether or not to use printk_deferred() to print the message. * @fmt: Message format. * * The message will be emitted if enabled through the pm_debug_messages * sysfs attribute. */ -void pm_pr_dbg(const char *fmt, ...) +void __pm_pr_dbg(bool defer, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -407,7 +408,10 @@ void pm_pr_dbg(const char *fmt, ...) 
vaf.fmt = fmt; vaf.va = &args; - printk(KERN_DEBUG "PM: %pV", &vaf); + if (defer) + printk_deferred(KERN_DEBUG "PM: %pV", &vaf); + else + printk(KERN_DEBUG "PM: %pV", &vaf); va_end(args); } diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 38bc4d2208e8..0754cadfa9e6 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "timekeeping_internal.h" @@ -75,7 +76,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t) int bin = min(fls(t->tv_sec), NUM_BINS-1); sleep_time_bin[bin]++; - printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n", - (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); + pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n", + (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } -- cgit v1.2.3 From 3c74541777302eec43a0d1327c4d58b8659a776b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:14:15 -0400 Subject: cgroup: fix error return value from cgroup_subtree_control() While refactoring, f7b2814bb9b6 ("cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write()") broke error return value from the function. The return value from the last operation is always overridden to zero. Fix it. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 744975947d01..df2e0f14a95d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3001,11 +3001,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); - cgroup_finalize_control(cgrp, ret); + if (ret) + goto out_unlock; kernfs_activate(cgrp->kn); - ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; -- cgit v1.2.3 From cc731525f26af85a1c3537da41e0abd1d35e0bdb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 16 Jul 2017 22:36:59 -0500 Subject: signal: Remove kernel interal si_code magic struct siginfo is a union and the kernel since 2.4 has been hiding a union tag in the high 16bits of si_code using the values: __SI_KILL __SI_TIMER __SI_POLL __SI_FAULT __SI_CHLD __SI_RT __SI_MESGQ __SI_SYS While this looks plausible on the surface, in practice this situation has not worked well. - Injected positive signals are not copied to user space properly unless they have these magic high bits set. - Injected positive signals are not reported properly by signalfd unless they have these magic high bits set. - These kernel internal values leaked to userspace via ptrace_peek_siginfo - It was possible to inject these kernel internal values and cause the the kernel to misbehave. - Kernel developers got confused and expected these kernel internal values in userspace in kernel self tests. - Kernel developers got confused and set si_code to __SI_FAULT which is SI_USER in userspace which causes userspace to think an ordinary user sent the signal and that it was not kernel generated. - The values make it impossible to reorganize the code to transform siginfo_copy_to_user into a plain copy_to_user. As si_code must be massaged before being passed to userspace. So remove these kernel internal si codes and make the kernel code simpler and more maintainable. 
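
As an editorial illustration (not part of the patch), the small user-space sketch below shows why the hidden union tag was fragile: the tag stays invisible only as long as every copy path remembers to truncate si_code to a short, and any path that copies the raw value leaks the kernel-internal encoding to user space. The __SI_FAULT_DEMO and SEGV_MAPERR_DEMO constants are hypothetical stand-ins chosen for the demonstration, not the kernel's actual definitions.

#include <stdio.h>

/*
 * Hypothetical stand-ins for an old-style kernel-internal union tag and
 * a user-visible si_code; only their layout (tag in the high 16 bits)
 * matters for the illustration.
 */
#define __SI_FAULT_DEMO  (3 << 16)
#define SEGV_MAPERR_DEMO 1

int main(void)
{
	int si_code = __SI_FAULT_DEMO | SEGV_MAPERR_DEMO;

	/*
	 * Casting to short strips the tag, which is what the masked copy
	 * path relied on ...
	 */
	printf("masked copy:   %d\n", (short)si_code);

	/*
	 * ... but any path that copies the raw value (ptrace, signalfd,
	 * injected signals) exposes the internal encoding.
	 */
	printf("unmasked copy: %d\n", si_code);
	return 0;
}

With siginfo_layout() deciding the union member from the signal number and an unmodified si_code, no such masking is needed on the copy-out path.
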
To replace these kernel internal magic si_codes introduce the helper function siginfo_layout, that takes a signal number and an si_code and computes which union member of siginfo is being used. Have siginfo_layout return an enumeration so that gcc will have enough information to warn if a switch statement does not handle all of union members. A couple of architectures have a messed up ABI that defines signal specific duplications of SI_USER which causes more special cases in siginfo_layout than I would like. The good news is only problem architectures pay the cost. Update all of the code that used the previous magic __SI_ values to use the new SIL_ values and to call siginfo_layout to get those values. Escept where not all of the cases are handled remove the defaults in the switch statements so that if a new case is missed in the future the lack will show up at compile time. Modify the code that copies siginfo si_code to userspace to just copy the value and not cast si_code to a short first. The high bits are no longer used to hold a magic union member. Fixup the siginfo header files to stop including the __SI_ values in their constants and for the headers that were missing it to properly update the number of si_codes for each signal type. The fixes to copy_siginfo_from_user32 implementations has the interesting property that several of them perviously should never have worked as the __SI_ values they depended up where kernel internal. With that dependency gone those implementations should work much better. The idea of not passing the __SI_ values out to userspace and then not reinserting them has been tested with criu and criu worked without changes. Ref: 2.4.0-test1 Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 4 ++-- kernel/ptrace.c | 6 ++--- kernel/signal.c | 72 +++++++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 59 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..c8f23613df5b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1616,7 +1616,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); @@ -1742,7 +1742,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 60f356d91060..84b1367935e4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - if (copy_siginfo_to_user32(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user32(uinfo, &info)) { ret = -EFAULT; break; } @@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, { siginfo_t 
__user *uinfo = (siginfo_t __user *) data; - if (copy_siginfo_to_user(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user(uinfo, &info)) { ret = -EFAULT; break; } diff --git a/kernel/signal.c b/kernel/signal.c index caed9133ae52..6bd53c8189f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2682,6 +2682,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, } #endif +enum siginfo_layout siginfo_layout(int sig, int si_code) +{ + enum siginfo_layout layout = SIL_KILL; + if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { + static const struct { + unsigned char limit, layout; + } filter[] = { + [SIGILL] = { NSIGILL, SIL_FAULT }, + [SIGFPE] = { NSIGFPE, SIL_FAULT }, + [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, + [SIGBUS] = { NSIGBUS, SIL_FAULT }, + [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, +#if defined(SIGMET) && defined(NSIGEMT) + [SIGEMT] = { NSIGEMT, SIL_FAULT }, +#endif + [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, + [SIGPOLL] = { NSIGPOLL, SIL_POLL }, +#ifdef __ARCH_SIGSYS + [SIGSYS] = { NSIGSYS, SIL_SYS }, +#endif + }; + if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) + layout = filter[sig].layout; + else if (si_code <= NSIGPOLL) + layout = SIL_POLL; + } else { + if (si_code == SI_TIMER) + layout = SIL_TIMER; + else if (si_code == SI_SIGIO) + layout = SIL_POLL; + else if (si_code < 0) + layout = SIL_RT; + /* Tests to support buggy kernel ABIs */ +#ifdef TRAP_FIXME + if ((sig == SIGTRAP) && (si_code == TRAP_FIXME)) + layout = SIL_FAULT; +#endif +#ifdef FPE_FIXME + if ((sig == SIGFPE) && (si_code == FPE_FIXME)) + layout = SIL_FAULT; +#endif + } + return layout; +} + #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) @@ -2704,22 +2749,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code & __SI_MASK) { - case __SI_KILL: + err |= __put_user(from->si_code, &to->si_code); + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_ptr, &to->si_ptr); + case SIL_TIMER: + /* Unreached SI_TIMER is negative */ break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user(from->si_addr, &to->si_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); @@ -2744,30 +2787,25 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_pkey, &to->si_pkey); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_status, &to->si_status); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now. 
*/ - case __SI_MESGQ: /* But this is */ + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; #ifdef __ARCH_SIGSYS - case __SI_SYS: + case SIL_SYS: err |= __put_user(from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; #endif - default: /* this is just in case for now ... */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; } return err; } -- cgit v1.2.3 From 9305706c2e808ae59f1eb201867f82f1ddf6d7a6 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 21 Jul 2017 14:37:34 +0100 Subject: bpf/verifier: fix min/max handling in BPF_SUB We have to subtract the src max from the dst min, and vice-versa, since (e.g.) the smallest result comes from the largest subtrahend. Fixes: 484611357c19 ("bpf: allow access into map value arrays") Signed-off-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index af9e84a4944e..664d93972373 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1865,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, * do our normal operations to the register, we need to set the values * to the min/max since they are undefined. */ - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (opcode != BPF_SUB) { + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + } switch (opcode) { case BPF_ADD: @@ -1879,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: + /* If one of our values was at the end of our ranges, then the + * _opposite_ value in the dst_reg goes to the end of our range. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= min_val; + dst_reg->min_value -= max_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= max_val; + dst_reg->max_value -= min_val; dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: -- cgit v1.2.3 From 8e6bcd9f7eac47104ce6c5d82c554c9b244b38e1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Jul 2017 02:07:54 +0200 Subject: PM / s2idle: Rearrange the main suspend-to-idle loop As a preparation for subsequent changes, rearrange the core suspend-to-idle code by moving the initial invocation of dpm_suspend_noirq() into s2idle_loop(). This also causes debug messages from that code to appear in a less confusing order. Signed-off-by: Rafael J. 
Wysocki --- kernel/power/power.h | 4 ++++ kernel/power/suspend.c | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 7fdc40d31b7d..6e3ac6a73d65 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -245,7 +245,11 @@ enum { #define TEST_FIRST TEST_NONE #define TEST_MAX (__TEST_AFTER_LAST - 1) +#ifdef CONFIG_PM_DEBUG extern int pm_test_level; +#else +#define pm_test_level (TEST_NONE) +#endif #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 18cdb1596d27..0c61713b6e5c 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -108,7 +108,13 @@ static void s2idle_loop(void) { pm_pr_dbg("suspend-to-idle\n"); - do { + while (!dpm_suspend_noirq(PMSG_SUSPEND)) { + /* + * Suspend-to-idle equals + * frozen processes + suspended devices + idle processors. + * Thus freeze_enter() should be called right after + * all devices have been suspended. + */ freeze_enter(); if (freeze_ops && freeze_ops->wake) @@ -122,7 +128,7 @@ static void s2idle_loop(void) break; pm_wakeup_clear(false); - } while (!dpm_suspend_noirq(PMSG_SUSPEND)); + } pm_pr_dbg("resume from suspend-to-idle\n"); } @@ -379,6 +385,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Devices_early_resume; + if (state == PM_SUSPEND_FREEZE && pm_test_level != TEST_PLATFORM) { + s2idle_loop(); + goto Platform_early_resume; + } + error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { pr_err("PM: noirq suspend of devices failed\n"); @@ -391,17 +402,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; - /* - * PM_SUSPEND_FREEZE equals - * frozen processes + suspended devices + idle processors. - * Thus we should invoke freeze_enter() soon after - * all the devices are suspended. - */ - if (state == PM_SUSPEND_FREEZE) { - s2idle_loop(); - goto Platform_early_resume; - } - error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; -- cgit v1.2.3 From 9a3ebe3523cc8297301d5d95332536ad123856bf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Jul 2017 02:12:10 +0200 Subject: PM / sleep: Check pm_wakeup_pending() in __device_suspend_noirq() Restore the pm_wakeup_pending() check in __device_suspend_noirq() removed by commit eed4d47efe95 (ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle) as that allows the function to return earlier if there's a wakeup event pending already (so that it may spend less time on carrying out operations that will be reversed shortly anyway) and rework the main suspend-to-idle loop to take that optimization into account. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0c61713b6e5c..5cf232795318 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -108,19 +108,32 @@ static void s2idle_loop(void) { pm_pr_dbg("suspend-to-idle\n"); - while (!dpm_suspend_noirq(PMSG_SUSPEND)) { + for (;;) { + int error; + + dpm_noirq_begin(); + /* * Suspend-to-idle equals * frozen processes + suspended devices + idle processors. * Thus freeze_enter() should be called right after * all devices have been suspended. 
*/ - freeze_enter(); + error = dpm_noirq_suspend_devices(PMSG_SUSPEND); + if (!error) + freeze_enter(); + + dpm_noirq_resume_devices(PMSG_RESUME); + if (error && (error != -EBUSY || !pm_wakeup_pending())) { + dpm_noirq_end(); + break; + } if (freeze_ops && freeze_ops->wake) freeze_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_noirq_end(); + if (freeze_ops && freeze_ops->sync) freeze_ops->sync(); -- cgit v1.2.3 From e516a1db43cea397cf63d395534236d9869f0310 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Jul 2017 14:44:02 +0200 Subject: PM / sleep: Put pm_test under CONFIG_PM_SLEEP_DEBUG The pm_test sysfs attribute is under CONFIG_PM_DEBUG, but it doesn't make sense to provide it if CONFIG_PM_SLEEP is unset, so put it under CONFIG_PM_SLEEP_DEBUG instead. Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 8 +++----- kernel/power/power.h | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b7876eaf83f3..3074ea4cec0a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -150,7 +150,7 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr power_attr(mem_sleep); #endif /* CONFIG_SUSPEND */ -#ifdef CONFIG_PM_DEBUG +#ifdef CONFIG_PM_SLEEP_DEBUG int pm_test_level = TEST_NONE; static const char * const pm_tests[__TEST_AFTER_LAST] = { @@ -211,7 +211,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_test); -#endif /* CONFIG_PM_DEBUG */ +#endif /* CONFIG_PM_SLEEP_DEBUG */ #ifdef CONFIG_DEBUG_FS static char *suspend_step_name(enum suspend_stat_step step) @@ -746,10 +746,8 @@ static struct attribute * g[] = { &wake_lock_attr.attr, &wake_unlock_attr.attr, #endif -#ifdef CONFIG_PM_DEBUG - &pm_test_attr.attr, -#endif #ifdef CONFIG_PM_SLEEP_DEBUG + &pm_test_attr.attr, &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, &pm_debug_messages_attr.attr, diff --git a/kernel/power/power.h b/kernel/power/power.h index 6e3ac6a73d65..268c1b0afc28 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -245,7 +245,7 @@ enum { #define TEST_FIRST TEST_NONE #define TEST_MAX (__TEST_AFTER_LAST - 1) -#ifdef CONFIG_PM_DEBUG +#ifdef CONFIG_PM_SLEEP_DEBUG extern int pm_test_level; #else #define pm_test_level (TEST_NONE) -- cgit v1.2.3 From bebcdae3ec13b2171ff91594787ab37b3fdb3306 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Jul 2017 14:49:51 +0200 Subject: PM / suspend: Use mem_sleep_labels[] strings in messages Some messages in suspend.c currently print state names from pm_states[], but that may be confusing if the mem_sleep sysfs attribute is changed to anything different from "mem", because in those cases the messages will say either "freeze" or "standby" after writing "mem" to /sys/power/state. To avoid the confusion, use mem_sleep_labels[] strings in those messages instead. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Andy Shevchenko --- kernel/power/suspend.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5cf232795318..a3d270e2e313 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif - pm_pr_dbg("Preparing system for sleep (%s)\n", pm_states[state]); + pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) @@ -570,7 +570,7 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pm_pr_dbg("Suspending system (%s)\n", pm_states[state]); + pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); @@ -597,7 +597,7 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; - pr_info("PM: suspend entry (%s)\n", pm_states[state]); + pr_info("PM: suspend entry (%s)\n", mem_sleep_labels[state]); error = enter_state(state); if (error) { suspend_stats.fail++; -- cgit v1.2.3 From 142bce74fd141913b2127970a9bd90f33c5b7653 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Jul 2017 14:50:48 +0200 Subject: PM / suspend: Define pr_fmt() in suspend.c Define a common prefix ("PM:") for messages printed by the code in kernel/power/suspend.c. Signed-off-by: Rafael J. Wysocki Reviewed-by: Andy Shevchenko --- kernel/power/suspend.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index a3d270e2e313..4bce46ddc2cd 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -8,6 +8,8 @@ * This file is released under the GPLv2. 
*/ +#define pr_fmt(fmt) "PM: " fmt + #include #include #include @@ -391,7 +393,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - pr_err("PM: late suspend of devices failed\n"); + pr_err("late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -405,7 +407,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - pr_err("PM: noirq suspend of devices failed\n"); + pr_err("noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -481,7 +483,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { - pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); + pr_err("Some devices failed to suspend, or early wake event detected\n"); goto Recover_platform; } suspend_test_finish("suspend devices"); @@ -539,7 +541,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); + pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif @@ -554,7 +556,7 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - pr_info("PM: Syncing filesystems ... "); + pr_info("Syncing filesystems ... "); sys_sync(); pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); @@ -597,7 +599,7 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; - pr_info("PM: suspend entry (%s)\n", mem_sleep_labels[state]); + pr_info("suspend entry (%s)\n", mem_sleep_labels[state]); error = enter_state(state); if (error) { suspend_stats.fail++; @@ -605,7 +607,7 @@ int pm_suspend(suspend_state_t state) } else { suspend_stats.success++; } - pr_info("PM: suspend exit\n"); + pr_info("suspend exit\n"); return error; } EXPORT_SYMBOL(pm_suspend); -- cgit v1.2.3 From 825c5bd2fd47d30148db15fc121216c483682b01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 May 2017 16:16:40 -0700 Subject: srcu: Move rcu_scheduler_starting() from Tiny RCU to Tiny SRCU Other than lockdep support, Tiny RCU has no need for the scheduler status. However, Tiny SRCU will need this to control boot-time behavior independent of lockdep. Therefore, this commit moves rcu_scheduler_starting() from kernel/rcu/tiny_plugin.h to kernel/rcu/srcutiny.c. This in turn allows the complete removal of kernel/rcu/tiny_plugin.h. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/srcutiny.c | 8 ++++++++ kernel/rcu/tiny.c | 2 -- kernel/rcu/tiny_plugin.h | 47 ----------------------------------------------- 3 files changed, 8 insertions(+), 49 deletions(-) delete mode 100644 kernel/rcu/tiny_plugin.h (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 1a1c1047d2ed..76ac5f50b2c7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -33,6 +33,8 @@ #include "rcu_segcblist.h" #include "rcu.h" +int rcu_scheduler_active __read_mostly; + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->srcu_lock_nesting[0] = 0; @@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp) destroy_rcu_head_on_stack(&rs.head); } EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* Lockdep diagnostics. */ +void __init rcu_scheduler_starting(void) +{ + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; +} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f8488965250f..a64eee0db39e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; -#include "tiny_plugin.h" - void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h deleted file mode 100644 index f0a01b2a3062..000000000000 --- a/kernel/rcu/tiny_plugin.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. McKenney - */ - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -#include - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. Note that unlike - * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE - * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. - * The reason for this is that Tiny RCU does not need kthreads, so does - * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. Unless SRCU is in the mix. - */ -void __init rcu_scheduler_starting(void) -{ - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) - ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; -} - -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ -- cgit v1.2.3 From 0d8a1e831e21d955af68f4ae5658b58b7ec25557 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 15 Jun 2017 17:06:38 -0700 Subject: srcu: Make process_srcu() be static The function process_srcu() is not invoked outside of srcutree.c, so this commit makes it static and drops the EXPORT_SYMBOL_GPL(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..94bd6ed43ea3 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void process_srcu(struct work_struct *work); /* * Initialize SRCU combining tree. Note that statically allocated @@ -1194,7 +1195,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* * This is the work-queue function that handles SRCU grace periods. */ -void process_srcu(struct work_struct *work) +static void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -1203,7 +1204,6 @@ void process_srcu(struct work_struct *work) srcu_advance_state(sp); srcu_reschedule(sp, srcu_get_delay(sp)); } -EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, -- cgit v1.2.3 From 115a1a5285664f1931c30457081b4ae1e648f1f9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 May 2017 13:31:03 -0700 Subject: rcutorture: Move SRCU status printing to SRCU implementations This commit gets rid of some ugly #ifdefs in rcutorture.c by moving the SRCU status printing to the SRCU implementations. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 39 +-------------------------------------- kernel/rcu/srcutree.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..aedc8f2ad955 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -561,44 +561,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int __maybe_unused cpu; - int idx; - -#ifdef CONFIG_TREE_SRCU - idx = srcu_ctlp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *counts; - - counts = per_cpu_ptr(srcu_ctlp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. 
- */ - smp_rmb(); - - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); - } - pr_cont("\n"); -#elif defined(CONFIG_TINY_SRCU) - idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", - torture_type, TORTURE_FLAG, idx, - READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), - READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -#endif + srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG); } static void srcu_torture_synchronize_expedited(void) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..8f6fd11c338a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1217,6 +1217,40 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +{ + int cpu; + int idx; + + idx = sp->srcu_idx & 0x1; + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *counts; + + counts = per_cpu_ptr(sp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + } + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(srcu_torture_stats_print); + static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); -- cgit v1.2.3 From ac3748c60426602c091c8c9281c95fadb781fcc0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 May 2017 13:59:52 -0700 Subject: rcutorture: Print SRCU lock/unlock totals This commit adds printing of SRCU lock/unlock totals, which are just the sums of the per-CPU counts. Saves a bit of mental arithmetic. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8f6fd11c338a..b4f491b06ee0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1221,6 +1221,7 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) { int cpu; int idx; + unsigned long s0 = 0, s1 = 0; idx = sp->srcu_idx & 0x1; pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); @@ -1246,8 +1247,10 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) c0 = l0 - u0; c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + s0 += c0; + s1 += c1; } - pr_cont("\n"); + pr_cont(" T(%ld,%ld)\n", s0, s1); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); -- cgit v1.2.3 From f1dbc54b929c5917c57b6661c9e37e42868b26a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:23:06 -0700 Subject: rcu: Remove CONFIG_TASKS_RCU ifdef from rcuperf.c The synchronize_rcu_tasks() and call_rcu_tasks() APIs are now available regardless of kernel configuration, so this commit removes the CONFIG_TASKS_RCU ifdef from rcuperf.c. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cc18110b612..1f87a02c3399 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks perf testing. */ @@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -#define RCUPERF_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUPERF_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * If performance tests complete, wait for shutdown to commence. */ @@ -658,7 +643,7 @@ rcu_perf_init(void) int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, - RCUPERF_TASKS_OPS + &tasks_ops, }; if (!torture_init_begin(perf_type, verbose, &perf_runnable)) -- cgit v1.2.3 From 5e741fa9e9698f4010bb85eff252186f7a4071f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 12:52:44 -0700 Subject: rcutorture: Enable SRCU readers from timer handler Now that it is legal to invoke srcu_read_lock() and srcu_read_unlock() for a given srcu_struct from both process context and {soft,}irq handlers, it is time to test it. This commit therefore enables testing of SRCU readers from rcutorture's timer handler, using in_task() to determine whether or not it is safe to sleep in the SRCU read-side critical sections. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aedc8f2ad955..8d59c82bec0b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -522,7 +522,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp) delay = torture_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) + if (!delay && in_task()) schedule_timeout_interruptible(longdelay); else rcu_read_delay(rrsp); @@ -583,6 +583,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcu" }; @@ -615,6 +616,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcud" }; -- cgit v1.2.3 From b3c983142d4584c9d506b1ed31b65f4292b4aea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 16:39:00 -0700 Subject: rcutorture: Place event-traced strings into trace buffer Strings used in event tracing need to be specially handled, for example, being copied to the trace buffer instead of being pointed to by the trace buffer. Although the TPS() macro can be used to "launder" pointed-to strings, this might not be all that effective within a loadable module. This commit therefore copies rcutorture's strings to the trace buffer. Signed-off-by: Paul E. 
McKenney Cc: Steven Rostedt --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8d59c82bec0b..b48d2107f176 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -496,7 +496,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, - .name = "rcu_busted" + .name = "busted" }; /* -- cgit v1.2.3 From 808de39cf422aa08dffd29510d841848ea18e215 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jun 2017 10:03:22 -0700 Subject: rcutorture: Add task's CPU for rcutorture writer stalls It appears that at least some of the rcutorture writer stall messages coincide with unusually long CPU-online operations, for example, no fewer than 205 seconds in a recent test. It is of course possible that the writer stall is not unrelated to this unusually long CPU-hotplug operation, and so this commit adds the rcutorture writer task's CPU to the stall message to gain more information about this possible connection. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b48d2107f176..75ac749ced7c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1319,11 +1319,12 @@ rcu_torture_stats_print(void) srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags, - wtp == NULL ? ~0UL : wtp->state); + wtp == NULL ? ~0UL : wtp->state, + wtp == NULL ? -1 : (int)task_cpu(wtp)); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } -- cgit v1.2.3 From a3b7b6c2739caf996b95a0164b5b4541c27630c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jun 2017 16:07:17 -0700 Subject: rcutorture: Eliminate unused ts_rem local from rcu_trace_clock_local() This commit removes an unused local variable named ts_rem that is marked __maybe_unused. Yes, the variable was assigned to, but it was never used beyond that point, hence not needed. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 75ac749ced7c..6e3f644280ee 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); static u64 notrace rcu_trace_clock_local(void) { u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + + (void)do_div(ts, NSEC_PER_USEC); return ts; } #else /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3 From 96036c43066a04c99353abb4a342cc38a52d36f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Jul 2017 13:52:18 -0700 Subject: rcu: Add last-CPU to GP-kthread starvation messages This commit augments the grace-period-kthread starvation debugging messages by adding the last CPU that ran the kthread. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..48c6ab5ca164 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1358,12 +1358,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0, + rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); -- cgit v1.2.3 From f34c8585ed70f0f9b5ff9cf59c0dd533cddb975f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Jul 2017 15:27:32 -0700 Subject: rcutorture: Invoke call_rcu() from timer handler The Linux kernel invokes call_rcu() from various interrupt/softirq handlers, but rcutorture does not. This commit therefore adds this behavior to rcutorture's repertoire. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6e3f644280ee..0efd69b2fb8c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1080,6 +1080,11 @@ rcu_torture_fakewriter(void *arg) return 0; } +static void rcu_torture_timer_cb(struct rcu_head *rhp) +{ + kfree(rhp); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1142,6 +1147,14 @@ static void rcu_torture_timer(unsigned long unused) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); + + /* Test call_rcu() invocation from interrupt handler. */ + if (cur_ops->call) { + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT); + + if (rhp) + cur_ops->call(rhp, rcu_torture_timer_cb); + } } /* -- cgit v1.2.3 From 241a974ba2c0d98e2104012cb80ed4494c0e66a7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Jul 2017 10:40:04 +0300 Subject: bpf: dev_map_alloc() shouldn't return NULL We forgot to set the error code on two error paths which means that we return ERR_PTR(0) which is NULL. The caller, find_and_alloc_map(), is not expecting that and will have a NULL dereference. Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Dan Carpenter Acked-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: David S. 
Miller --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 899364d097f5..d439ee0eadb1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -114,6 +114,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + err = -ENOMEM; /* A per cpu bitfield with a bit per possible net device */ dtab->flush_needed = __alloc_percpu( BITS_TO_LONGS(attr->max_entries) * -- cgit v1.2.3 From bf50f0e8a03005d19de66d01261d855cdeedf572 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 24 Jul 2017 13:56:28 -0600 Subject: sched/core: Fix some documentation build warnings The kerneldoc comments for try_to_wake_up_local() were out of date, leading to these documentation build warnings: ./kernel/sched/core.c:2080: warning: No description found for parameter 'rf' ./kernel/sched/core.c:2080: warning: Excess function parameter 'cookie' description in 'try_to_wake_up_local' Update the comment to reflect current reality and give us some peace and quiet. Signed-off-by: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20170724135628.695cecfc@lwn.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..0869b20fba81 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2069,7 +2069,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened - * @cookie: context's cookie for pinning + * @rf: request-queue flags for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not -- cgit v1.2.3 From 5279631271b32201243c60308a8987bd585e4460 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Fri, 7 Jul 2017 11:15:58 +0800 Subject: module: fix ddebug_remove_module() ddebug_remove_module() use mod->name to find the ddebug_table of the module and remove it. But dynamic_debug_setup() use the first _ddebug->modname to create ddebug_table for the module. It's ok when the _ddebug->modname is the same with the mod->name. But livepatch module is special, it may contain _ddebugs of other modules, the modname of which is different from the name of livepatch module. So ddebug_remove_module() can't use mod->name to find the right ddebug_table and remove it. It can cause kernel crash when we cat the file /dynamic_debug/control. 
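
The following user-space sketch (editorial illustration only; the module and table names are hypothetical) shows the bug class being fixed: when an entry is registered under one key (the first _ddebug->modname) but removal looks it up under a different key (mod->name), the lookup never matches and the entry outlives the module whose data it points into. The patch below resolves this by keying both ddebug_add_module() and ddebug_remove_module() on mod->name.

#include <stdio.h>
#include <string.h>

struct demo_entry {
	const char *key;	/* name the entry was registered under */
	int live;
};

static struct demo_entry table[4];

static void demo_add(const char *key)
{
	int i;

	for (i = 0; i < 4; i++) {
		if (!table[i].live) {
			table[i].key = key;
			table[i].live = 1;
			return;
		}
	}
}

static void demo_remove(const char *key)
{
	int i;

	for (i = 0; i < 4; i++)
		if (table[i].live && !strcmp(table[i].key, key))
			table[i].live = 0;	/* never matches if the keys differ */
}

int main(void)
{
	/* Registered under the first _ddebug->modname ... */
	demo_add("target_module");
	/* ... but removed under the livepatch module's own name. */
	demo_remove("livepatch_fix");
	printf("stale entries left behind: %d\n", table[0].live);
	return 0;
}
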
Signed-off-by: Zhou Chengming Signed-off-by: Jessica Yu --- kernel/module.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 40f983cbea81..de66ec825992 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2707,21 +2707,21 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) } #endif /* CONFIG_KALLSYMS */ -static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) +static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) { if (!debug) return; #ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, debug->modname)) + if (ddebug_add_module(debug, num, mod->name)) pr_err("dynamic debug error adding module: %s\n", debug->modname); #endif } -static void dynamic_debug_remove(struct _ddebug *debug) +static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) { if (debug) - ddebug_remove_module(debug->modname); + ddebug_remove_module(mod->name); } void * __weak module_alloc(unsigned long size) @@ -3715,7 +3715,7 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_arch_cleanup; } - dynamic_debug_setup(info->debug, info->num_debug); + dynamic_debug_setup(mod, info->debug, info->num_debug); /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ ftrace_module_init(mod); @@ -3779,7 +3779,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_disable_nx(mod); ddebug_cleanup: - dynamic_debug_remove(info->debug); + dynamic_debug_remove(mod, info->debug); synchronize_sched(); kfree(mod->args); free_arch_cleanup: -- cgit v1.2.3 From 8be6e1b15c54402106e6ba9bc706e685458b2d2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 29 Apr 2017 20:03:20 -0700 Subject: rcu: Use timer as backstop for NOCB deferred wakeups The handling of RCU's no-CBs CPUs has a maintenance headache, namely that if call_rcu() is invoked with interrupts disabled, the rcuo kthread wakeup must be defered to a point where we can be sure that scheduler locks are not held. Of course, there are a lot of code paths leading from an interrupts-disabled invocation of call_rcu(), and missing any one of these can result in excessive callback-invocation latency, and potentially even system hangs. This commit therefore uses a timer to guarantee that the wakeup will eventually occur. If one of the deferred-wakeup points kicks in, then the timer is simply cancelled. This commit also fixes up an incomplete removal of commits that were intended to plug remaining exit paths, which should have the added benefit of reducing the overhead of RCU's context-switch hooks. In addition, it simplifies leader-to-follower callback-list handoff by introducing locking. The call_rcu()-to-leader handoff continues to use atomic operations in order to maintain good real-time latency for common-case use of call_rcu(). Signed-off-by: Paul E. McKenney [ paulmck: Dan Carpenter fix for mod_timer() usage bug found by smatch. ] --- kernel/rcu/tree.h | 2 + kernel/rcu/tree_plugin.h | 179 +++++++++++++++++++++++++++++------------------ 2 files changed, 111 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..fe83f684ddcd 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -268,7 +268,9 @@ struct rcu_data { struct rcu_head **nocb_follower_tail; struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. 
*/ struct task_struct *nocb_kthread; + raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + struct timer_list nocb_timer; /* Enforce finite deferral. */ /* The following fields are used by the leader, hence own cacheline. */ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..bb9e6e43130f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1788,22 +1788,61 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. + * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * and this function releases it. */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void __wake_nocb_leader(struct rcu_data *rdp, bool force, + unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!READ_ONCE(rdp_leader->nocb_kthread)) + lockdep_assert_held(&rdp->nocb_lock); + if (!READ_ONCE(rdp_leader->nocb_kthread)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; - if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { + } + if (rdp_leader->nocb_leader_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + del_timer(&rdp->nocb_timer); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); + } else { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } } +/* + * Kick the leader kthread for this NOCB group, but caller has not + * acquired locks. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + __wake_nocb_leader(rdp, force, flags); +} + +/* + * Arrange to wake the leader kthread for this NOCB group at some + * future time when it is safe to do so. + */ +static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp->nocb_timer, jiffies + 1); + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + /* * Does the specified CPU need an RCU callback for the specified flavor * of rcu_barrier()? @@ -1891,11 +1930,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeEmptyIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1905,11 +1941,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. 
*/ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeOvfIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { @@ -2031,6 +2064,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static void nocb_leader_wait(struct rcu_data *my_rdp) { bool firsttime = true; + unsigned long flags; bool gotcbs; struct rcu_data *rdp; struct rcu_head **tail; @@ -2042,7 +2076,11 @@ wait_again: trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); - /* Memory barrier handled by smp_mb() calls below and repoll. */ + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + my_rdp->nocb_leader_sleep = true; + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); @@ -2054,7 +2092,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; - smp_mb(); /* wakeup before ->nocb_head reads. */ + smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2066,56 +2104,41 @@ wait_again: gotcbs = true; } - /* - * If there were no callbacks, sleep a bit, rescan after a - * memory barrier, and go retry. - */ + /* No callbacks? Sleep a bit if polling, and go retry. */ if (unlikely(!gotcbs)) { - if (!rcu_nocb_poll) + WARN_ON(signal_pending(current)); + if (rcu_nocb_poll) { + schedule_timeout_interruptible(1); + } else { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "WokeEmpty"); - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); - - /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (READ_ONCE(rdp->nocb_head)) { - /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_sleep = false; - break; - } + } goto wait_again; } /* Wait for one grace period. */ rcu_nocb_wait_gp(my_rdp); - /* - * We left ->nocb_leader_sleep unset to reduce cache thrashing. - * We set it now, but recheck for new callbacks while - * traversing our follower list. - */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ - /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (READ_ONCE(rdp->nocb_head)) + if (!rcu_nocb_poll && + READ_ONCE(rdp->nocb_head) && + READ_ONCE(my_rdp->nocb_leader_sleep)) { + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + } if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ /* Append callbacks to follower's "done" list. */ - tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; - smp_mb__after_atomic(); /* Store *tail before wakeup. 
*/ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* - * List was empty, wake up the follower. - * Memory barriers supplied by atomic_long_add(). - */ + /* List was empty, so wake up the follower. */ swake_up(&rdp->nocb_wq); } } @@ -2131,28 +2154,16 @@ wait_again: */ static void nocb_follower_wait(struct rcu_data *rdp) { - bool firsttime = true; - for (;;) { - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "FollowerSleep"); - swait_event_interruptible(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - } else if (firsttime) { - /* Don't drown trace log with "Poll"! */ - firsttime = false; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); - } + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); + swait_event_interruptible(rdp->nocb_wq, + READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ return; } - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); } } @@ -2165,6 +2176,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + unsigned long flags; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2179,11 +2191,14 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. */ - list = READ_ONCE(rdp->nocb_follower_head); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + list = rdp->nocb_follower_head; + rdp->nocb_follower_head = NULL; + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - WRITE_ONCE(rdp->nocb_follower_head, NULL); - tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, @@ -2226,18 +2241,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { + unsigned long flags; int ndw; - if (!rcu_nocb_need_deferred_wakeup(rdp)) + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_nocb_need_deferred_wakeup(rdp)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; + } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); + __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(unsigned long x) +{ + do_nocb_deferred_wakeup_common((struct rcu_data *)x); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. 
+ */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (rcu_nocb_need_deferred_wakeup(rdp)) + do_nocb_deferred_wakeup_common(rdp); +} + void __init rcu_init_nohz(void) { int cpu; @@ -2287,6 +2323,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_lock_init(&rdp->nocb_lock); + setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, + (unsigned long)rdp); } /* -- cgit v1.2.3 From f274f1e72d7171c80c8c790040e47a23a74796b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 30 Jun 2017 13:13:59 -0700 Subject: task_work: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore replaces the spin_unlock_wait() call in task_work_run() with a spin_lock_irq() and a spin_unlock_irq() around the cmpxchg() dequeue loop. This should be safe from a performance perspective because ->pi_lock is local to the task and because calls to the other side of the race, task_work_cancel(), should be rare. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- kernel/task_work.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index d513051fcca2..836a72a66fba 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -96,20 +96,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + raw_spin_lock_irq(&task->pi_lock); do { work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); + raw_spin_unlock_irq(&task->pi_lock); if (!work) break; - /* - * Synchronize with task_work_cancel(). It can't remove - * the first entry == work, cmpxchg(task_works) should - * fail, but it can play with *work and other entries. - */ - raw_spin_unlock_wait(&task->pi_lock); do { next = work->next; -- cgit v1.2.3 From 918a8c2c4ea4fab8b7855b8da48bbaf0a733ebb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:18:26 -0400 Subject: cgroup: remove unnecessary empty check when enabling threaded mode cgroup_enable_threaded() checks that the cgroup doesn't have any tasks or children and fails the operation if so. This test is unnecessary: the first part is already checked by cgroup_can_be_thread_root(), and the no-children check serves no purpose. The latter actually causes a behavioral oddity. Please consider the following hierarchy, in which all cgroups are domains.

    A
   / \
  B   C
       \
        D

If B is made threaded, C and D become invalid domains. Due to the no-children restriction, threaded mode can't be enabled on C. For C and D, the only thing the user can do is removal. There is no reason for this restriction. Remove it.
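For illustration only (not part of the patch): a hypothetical userspace sketch, assuming cgroup2 is mounted at /sys/fs/cgroup and the A/B/C/D hierarchy above has already been created. With this change, writing "threaded" to C's cgroup.type is no longer rejected with -EBUSY merely because C has the child D (it can still fail for other reasons checked by cgroup_can_be_thread_root()).

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write "threaded" to <cgrp>/cgroup.type; returns 0 or -errno. */
static int make_threaded(const char *cgrp)
{
	char path[256];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/cgroup.type", cgrp);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -errno;
	if (write(fd, "threaded", strlen("threaded")) < 0)
		ret = -errno;
	close(fd);
	return ret;
}

int main(void)
{
	/* Making B threaded invalidates the C and D domains. */
	printf("B: %d\n", make_threaded("A/B"));
	/* Previously this returned -EBUSY because C has a child. */
	printf("C: %d\n", make_threaded("A/C"));
	return 0;
}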
Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e9a377dc5bdb..e0a558c4d358 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3146,13 +3146,6 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; - /* - * Allow enabling thread mode only on empty cgroups to avoid - * implicit migrations and recursive operations. - */ - if (cgroup_has_tasks(cgrp) || css_has_online_children(&cgrp->self)) - return -EBUSY; - /* * The following shouldn't cause actual migrations and should * always succeed. -- cgit v1.2.3 From c705a00d77457b44ba3790fdf0627ecb8593a254 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jul 2017 13:20:18 -0400 Subject: cgroup: add comment to cgroup_enable_threaded() Explain cgroup_enable_threaded() and note that the function can never be called on the root cgroup. Signed-off-by: Tejun Heo Suggested-by: Waiman Long --- kernel/cgroup/cgroup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e0a558c4d358..85f6a112344b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3129,6 +3129,15 @@ out_unlock: return ret ?: nbytes; } +/** + * cgroup_enable_threaded - make @cgrp threaded + * @cgrp: the target cgroup + * + * Called when "threaded" is written to the cgroup.type interface file and + * tries to make @cgrp threaded and join the parent's resource domain. + * This function is never called on the root cgroup as cgroup.type doesn't + * exist on it. + */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); -- cgit v1.2.3 From 0a94efb5acbb6980d7c9ab604372d93cd507e4d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:36:15 -0400 Subject: workqueue: implicit ordered attribute should be overridable 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically enabled the ordered attribute for unbound workqueues w/ max_active == 1. Because ordered workqueues reject max_active and some attribute changes, this implicit ordered mode broke cases where the user creates an unbound workqueue w/ max_active == 1 and later explicitly changes the related attributes. This patch distinguishes between explicit and implicit ordered settings and allows attribute changes to override the ordered mode when it was only set implicitly.
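A hypothetical, untested kernel-module sketch (illustrative only, not part of the patch) of the resulting distinction: a workqueue that is merely implicitly ordered (WQ_UNBOUND with max_active == 1) can have its max_active raised later, which simply drops the implicit __WQ_ORDERED flag, while a workqueue created with alloc_ordered_workqueue() is marked explicitly ordered and keeps rejecting such changes.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *implicit_wq, *explicit_wq;

static int __init ordered_demo_init(void)
{
	/* Implicitly ordered: unbound with max_active == 1. */
	implicit_wq = alloc_workqueue("demo_implicit", WQ_UNBOUND, 1);
	/* Explicitly ordered. */
	explicit_wq = alloc_ordered_workqueue("demo_explicit", 0);
	if (!implicit_wq || !explicit_wq)
		return -ENOMEM;

	/* Allowed after this patch: drops the implicit ordering. */
	workqueue_set_max_active(implicit_wq, 4);

	/* Still rejected (WARNs and is ignored): explicitly ordered. */
	workqueue_set_max_active(explicit_wq, 4);
	return 0;
}

static void __exit ordered_demo_exit(void)
{
	destroy_workqueue(implicit_wq);
	destroy_workqueue(explicit_wq);
}

module_init(ordered_demo_init);
module_exit(ordered_demo_exit);
MODULE_LICENSE("GPL");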
Signed-off-by: Tejun Heo Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") --- kernel/workqueue.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index abe4a4971c24..7146ea70a62d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3744,8 +3744,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); if (!ctx) @@ -4129,13 +4133,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5263,7 +5268,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); -- cgit v1.2.3 From a58163d8ca2c8d288ee9f95989712f98473a5ac2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jun 2017 12:11:34 -0700 Subject: rcu: Migrate callbacks earlier in the CPU-offline timeline RCU callbacks must be migrated away from an outgoing CPU, and this is done near the end of the CPU-hotplug operation, after the outgoing CPU is long gone. Unfortunately, this means that other CPU-hotplug callbacks can execute while the outgoing CPU's callbacks are still immobilized on the long-gone CPU's callback lists. If any of these CPU-hotplug callbacks must wait, either directly or indirectly, for the invocation of any of the immobilized RCU callbacks, the system will hang. This commit avoids such hangs by migrating the callbacks away from the outgoing CPU immediately upon its departure, shortly after the return from __cpu_die() in takedown_cpu(). Thus, RCU is able to advance these callbacks and invoke them, which allows all the after-the-fact CPU-hotplug callbacks to wait on these RCU callbacks without risk of a hang. While in the neighborhood, this commit also moves rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() under a pre-existing #ifdef to avoid including dead code on the one hand and to avoid define-without-use warnings on the other hand. Reported-by: Jeffrey Hugo Link: http://lkml.kernel.org/r/db9c91f6-1b17-6136-84f0-03c3c2581ab4@codeaurora.org Signed-off-by: Paul E. 
McKenney Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: Anna-Maria Gleixner Cc: Boris Ostrovsky Cc: Richard Weinberger --- kernel/cpu.c | 1 + kernel/rcu/tree.c | 209 +++++++++++++++++++++++++++++------------------------- 2 files changed, 114 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..bfbd649ccdc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu) __cpu_die(cpu); tick_cleanup_dead_cpu(cpu); + rcutree_migrate_callbacks(cpu); return 0; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..9bb5dff50815 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2562,85 +2562,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } -/* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs do not have orphanable callbacks. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. - */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - - /* Do the accounting first. */ - rdp->n_cbs_adopted += rsp->orphan_done.len; - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. 
*/ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - /* * Trace the fact that this CPU is going offline. */ @@ -2704,14 +2625,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them. There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2720,18 +2639,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - - WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || - !rcu_segcblist_empty(&rdp->cblist), - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", - cpu, rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -3937,6 +3844,116 @@ void rcu_report_dead(unsigned int cpu) for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); } + +/* + * Send the specified CPU's RCU callbacks to the orphanage. The + * specified CPU must be offline, and the caller must hold the + * ->orphan_lock. + */ +static void +rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, + struct rcu_node *rnp, struct rcu_data *rdp) +{ + lockdep_assert_held(&rsp->orphan_lock); + + /* No-CBs CPUs do not have orphanable callbacks. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) + return; + + /* + * Orphan the callbacks. First adjust the counts. This is safe + * because _rcu_barrier() excludes CPU-hotplug operations, so it + * cannot be running now. Thus no memory barrier is required. + */ + rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); + rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); + + /* + * Next, move those callbacks still needing a grace period to + * the orphanage, where some other CPU will pick them up. + * Some of the callbacks might have gone partway through a grace + * period, but that is too bad. They get to start over because we + * cannot assume that grace periods are synchronized across CPUs. + */ + rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + + /* + * Then move the ready-to-invoke callbacks to the orphanage, + * where some other CPU will pick them up. These will not be + * required to pass though another grace period: They are done. + */ + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); + + /* Finally, disallow further callbacks on this CPU. 
*/ + rcu_segcblist_disable(&rdp->cblist); +} + +/* + * Adopt the RCU callbacks from the specified rcu_state structure's + * orphanage. The caller must hold the ->orphan_lock. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) +{ + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + + lockdep_assert_held(&rsp->orphan_lock); + + /* No-CBs CPUs are handled specially. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) + return; + + /* Do the accounting first. */ + rdp->n_cbs_adopted += rsp->orphan_done.len; + if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) + rcu_idle_count_callbacks_posted(); + rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); + + /* + * We do not need a memory barrier here because the only way we + * can get here if there is an rcu_barrier() in flight is if + * we are the task doing the rcu_barrier(). + */ + + /* First adopt the ready-to-invoke callbacks, then the done ones. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); + WARN_ON_ONCE(rsp->orphan_done.head); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + WARN_ON_ONCE(rsp->orphan_pend.head); + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != + !rcu_segcblist_n_cbs(&rdp->cblist)); +} + +/* Orphan the dead CPU's callbacks, and then adopt them. */ +static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); + rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); + rcu_adopt_orphan_cbs(rsp, flags); + raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); +} + +/* + * The outgoing CPU has just passed through the dying-idle state, + * and we are being invoked from the CPU that was IPIed to continue the + * offline operation. We need to migrate the outgoing CPU's callbacks. + */ +void rcutree_migrate_callbacks(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_migrate_callbacks(cpu, rsp); +} #endif /* -- cgit v1.2.3 From 313517fc44fb2d8403654b2d3e511da7d1c78cd6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Jun 2017 16:55:40 -0700 Subject: rcu: Make expedited GPs correctly handle hardware CPU insertion The update of the ->expmaskinitnext and of ->ncpus are unsynchronized, with the value of ->ncpus being incremented long before the corresponding ->expmaskinitnext mask is updated. If an RCU expedited grace period sees ->ncpus change, it will update the ->expmaskinit masks from the new ->expmaskinitnext masks. But it is possible that ->ncpus has already been updated, but the ->expmaskinitnext masks still have their old values. For the current expedited grace period, no harm done. The CPU could not have been online before the grace period started, so there is no need to wait for its non-existent pre-existing readers. But the next RCU expedited grace period is in a world of hurt. The value of ->ncpus has already been updated, so this grace period will assume that the ->expmaskinitnext masks have not changed. But they have, and they won't be taken into account until the next never-been-online CPU comes online. 
This means that RCU will be ignoring some CPUs that it should be paying attention to. The solution is to update ->ncpus and ->expmaskinitnext while holding the ->lock for the rcu_node structure containing the ->expmaskinitnext mask. Because smp_store_release() is now used to update ->ncpus and smp_load_acquire() is now used to locklessly read it, if the expedited grace period sees ->ncpus change, then the updating CPU has to already be holding the corresponding ->lock. Therefore, when the expedited grace period later acquires that ->lock, it is guaranteed to see the new value of ->expmaskinitnext. On the other hand, if the expedited grace period loads ->ncpus just before an update, earlier full memory barriers guarantee that the incoming CPU isn't far enough along to be running any RCU readers. This commit therefore makes the required change. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++-- kernel/rcu/tree_exp.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9bb5dff50815..f431114bc06a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3684,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (!rdp->beenonline) - WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; @@ -3789,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; + int nbits; + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_state *rsp; @@ -3799,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->qsmaskinitnext |= mask; + oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; + oldmask ^= rnp->expmaskinitnext; + nbits = bitmap_weight(&oldmask, BITS_PER_LONG); + /* Allow lockless access for expedited grace periods. */ + smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dd21ca47e4b4..46d61b597731 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) unsigned long flags; unsigned long mask; unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); + int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */ struct rcu_node *rnp; struct rcu_node *rnp_up; -- cgit v1.2.3 From a2b2df207acff1e3f965ff2c7c38255b06d583cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jun 2017 15:38:26 -0700 Subject: torture: Fix typo suppressing CPU-hotplug statistics The torture status line contains a series of values preceded by "onoff:". The last value in that line, the one preceding the "HZ=" string, is always zero. The reason that it is always zero is that torture_offline() was incrementing the sum_offl pointer instead of the value that this pointer referenced. This commit therefore makes this increment operate on the statistic rather than the pointer to the statistic. Signed-off-by: Paul E. 
McKenney --- kernel/torture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 55de96529287..637e172835d8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, torture_type, cpu); (*n_offl_successes)++; delta = jiffies - starttime; - sum_offl += delta; + *sum_offl += delta; if (*min_offl < 0) { *min_offl = delta; *max_offl = delta; -- cgit v1.2.3 From c47e067a3c57835fe5ce24d50482f5c325a64efd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jun 2017 20:30:08 -0700 Subject: rcu: Remove orphan/adopt event-tracing fields The rcu_node structure's ->n_cbs_orphaned and ->n_cbs_adopted fields are updated, but never read. This commit therefore removes them. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- kernel/rcu/tree.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f431114bc06a..f5acf34247fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3871,7 +3871,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * because _rcu_barrier() excludes CPU-hotplug operations, so it * cannot be running now. Thus no memory barrier is required. */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); /* @@ -3910,7 +3909,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) return; /* Do the accounting first. */ - rdp->n_cbs_adopted += rsp->orphan_done.len; if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..aec53c4d4aec 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -219,8 +219,6 @@ struct rcu_data { /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ -- cgit v1.2.3 From 95335c0355834c16cc11f041a981ee6782dba2e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 10:49:50 -0700 Subject: rcu: Check for NOCB CPUs and empty lists earlier in CB migration The current CPU-hotplug RCU-callback-migration code checks for the source (newly offlined) CPU being a NOCBs CPU down in rcu_send_cbs_to_orphanage(). This commit simplifies callback migration a bit by moving this check up to rcu_migrate_callbacks(). This commit also adds a check for the source CPU having no callbacks, which eases analysis of the rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() functions. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f5acf34247fb..aeea697d6f9f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3862,10 +3862,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, { lockdep_assert_held(&rsp->orphan_lock); - /* No-CBs CPUs do not have orphanable callbacks. 
*/ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - /* * Orphan the callbacks. First adjust the counts. This is safe * because _rcu_barrier() excludes CPU-hotplug operations, so it @@ -3935,6 +3931,9 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + return; /* No callbacks to migrate. */ + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); -- cgit v1.2.3 From b1a2d79fe7d210c114003362d93d529912d244df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 12:23:46 -0700 Subject: rcu: Make NOCB CPUs migrate CBs directly from outgoing CPU RCU's CPU-hotplug callback-migration code first moves the outgoing CPU's callbacks to ->orphan_done and ->orphan_pend, and only then moves them to the NOCB callback list. This commit avoids the extra step (and simplifies the code) by moving the callbacks directly from the outgoing CPU's callback list to the NOCB callback list. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 ++++++++------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 31 ++++++++++--------------------- 3 files changed, 19 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aeea697d6f9f..4ea28e820f4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3899,11 +3899,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) lockdep_assert_held(&rsp->orphan_lock); - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - /* Do the accounting first. */ if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); @@ -3928,13 +3923,20 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); + local_irq_save(flags); + my_rdp = this_cpu_ptr(rsp->rda); + if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { + local_irq_restore(flags); + return; + } + raw_spin_lock(&rsp->orphan_lock); /* irqs already disabled. 
*/ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index aec53c4d4aec..574513cf49b4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -493,7 +493,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..ff7d5ee49816 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1961,30 +1961,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is * not a no-CBs CPU. */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->orphan_done.len; - long qll = rsp->orphan_done.len_lazy; - - /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; - - /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_done.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), - rcu_cblist_tail(&rsp->orphan_done), - ql, qll, flags); - } - if (rsp->orphan_pend.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), - rcu_cblist_tail(&rsp->orphan_pend), - ql, qll, flags); - } - rcu_cblist_init(&rsp->orphan_done); - rcu_cblist_init(&rsp->orphan_pend); + return false; /* Not NOCBs CPU, caller must migrate CBs. */ + __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), + rcu_segcblist_tail(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_disable(&rdp->cblist); return true; } @@ -2459,7 +2448,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { -- cgit v1.2.3 From 9fa46fb8c9c6dfad30487fb3d905c2ff04b379b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 15:43:27 -0700 Subject: rcu: Advance outgoing CPU's callbacks before migrating them It is possible that the outgoing CPU is unaware of recent grace periods, and so it is also possible that some of its pending callbacks are actually ready to be invoked. The current callback-migration code would needlessly force these callbacks to pass through another grace period. This commit therefore invokes rcu_advance_cbs() on the outgoing CPU's callbacks in order to give them full credit for having passed through any recent grace periods. 
This also fixes an odd theoretical bug where there are no callbacks in the system except for those on the outgoing CPU, none of those callbacks have yet been associated with a grace-period number, there is never again another callback registered, and the surviving CPU never again takes a scheduling-clock interrupt, never goes idle, and never enters nohz_full userspace execution. Yes, this is (just barely) possible. It requires that the surviving CPU be a nohz_full CPU, that its scheduler-clock interrupt be shut off, and that it loop forever in the kernel. You get bonus points if you can make this one happen! ;-) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4ea28e820f4a..c080c6ed66af 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3926,6 +3926,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3936,7 +3937,11 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) local_irq_restore(flags); return; } - raw_spin_lock(&rsp->orphan_lock); /* irqs already disabled. */ + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ + raw_spin_unlock_rcu_node(rnp_root); + + raw_spin_lock(&rsp->orphan_lock); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); -- cgit v1.2.3 From 537b85c870babacc1cf13235e92bee9de86210e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 17:59:02 -0700 Subject: rcu: Eliminate rcu_state ->orphan_lock The ->orphan_lock is acquired and released only within the rcu_migrate_callbacks() function, which now acquires the root rcu_node structure's ->lock. This commit therefore eliminates the ->orphan_lock in favor of the root rcu_node structure's ->lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 +++------------ kernel/rcu/tree.h | 3 --- 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c080c6ed66af..58ab489eca66 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,7 +97,6 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ - .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ @@ -3853,15 +3852,12 @@ void rcu_report_dead(unsigned int cpu) /* * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. + * specified CPU must be offline. */ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rsp->orphan_lock); - /* * Orphan the callbacks. First adjust the counts. 
This is safe * because _rcu_barrier() excludes CPU-hotplug operations, so it @@ -3891,14 +3887,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, /* * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. + * orphanage. */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - lockdep_assert_held(&rsp->orphan_lock); - /* Do the accounting first. */ if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); @@ -3939,12 +3933,9 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - raw_spin_unlock_rcu_node(rnp_root); - - raw_spin_lock(&rsp->orphan_lock); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 574513cf49b4..62b1d0b0d47c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -348,14 +348,11 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; - /* Protect following fields. */ struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ /* need a grace period. */ struct rcu_cblist orphan_done; /* Orphaned callbacks that */ /* are ready to invoke. */ /* (Contains counts.) */ - /* End of fields guarded by orphan_lock. */ struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ -- cgit v1.2.3 From 21cc248384aeb0375b3cac164c276c78c503291a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 20:37:51 -0700 Subject: rcu: Advance callbacks after migration When migrating callbacks from a newly offlined CPU, we are already holding the root rcu_node structure's lock, so it costs almost nothing to advance and accelerate the newly migrated callbacks. This patch therefore makes this advancing and acceleration happen. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 58ab489eca66..f9f01aeb5add 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3935,6 +3935,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); + rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), -- cgit v1.2.3 From f2dbe4a562d4f17cc1bad3e36a9d1ccb19c86604 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 27 Jun 2017 07:44:06 -0700 Subject: rcu: Localize rcu_state ->orphan_pend and ->orphan_done Given that the rcu_state structure's >orphan_pend and ->orphan_done fields are used only during migration of callbacks from the recently offlined CPU to a surviving CPU, if rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() are combined, these fields can become local variables in the combined function. This commit therefore combines rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() into a new rcu_segcblist_merge() function and removes the ->orphan_pend and ->orphan_done fields. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 24 +++++++++++++++ kernel/rcu/rcu_segcblist.h | 2 ++ kernel/rcu/tree.c | 73 +++------------------------------------------- kernel/rcu/tree.h | 6 ---- 4 files changed, 30 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2b62a38b080f..7091d824b893 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -503,3 +503,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, return true; return false; } + +/* + * Merge the source rcu_segcblist structure into the destination + * rcu_segcblist structure, then initialize the source. Any pending + * callbacks from the source get to start over. It is best to + * advance and accelerate both the destination and the source + * before merging. + */ +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp) +{ + struct rcu_cblist donecbs; + struct rcu_cblist pendcbs; + + rcu_cblist_init(&donecbs); + rcu_cblist_init(&pendcbs); + rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); + rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); + rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 6e36e36478cd..c2f319f3f06a 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -162,3 +162,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, unsigned long seq); +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9f01aeb5add..d330c17c8df4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,8 +97,6 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ - .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ - .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -3850,76 +3848,12 @@ void rcu_report_dead(unsigned int cpu) rcu_cleanup_dying_idle_cpu(cpu, rsp); } -/* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* - * Orphan the callbacks. First adjust the counts. 
This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. - */ - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - /* Do the accounting first. */ - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. */ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - -/* Orphan the dead CPU's callbacks, and then adopt them. */ +/* Migrate the dead CPU's callbacks to the current CPU. */ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) { unsigned long flags; struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) @@ -3933,15 +3867,16 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != + !rcu_segcblist_n_cbs(&my_rdp->cblist)); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 62b1d0b0d47c..b99f31c2b0c3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -348,12 +348,6 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. 
*/ - struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ - /* need a grace period. */ - struct rcu_cblist orphan_done; /* Orphaned callbacks that */ - /* are ready to invoke. */ - /* (Contains counts.) */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ -- cgit v1.2.3 From aed4e046863820e6d06ebf7c079e9ad924608edf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jun 2017 08:38:45 -0700 Subject: rcu: Remove unused RCU list functions Given changes to callback migration, rcu_cblist_head(), rcu_cblist_tail(), rcu_cblist_count_cbs(), rcu_segcblist_segempty(), rcu_segcblist_dequeued_lazy(), and rcu_segcblist_new_cbs() are no longer used. This commit therefore removes them. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 84 ---------------------------------------------- kernel/rcu/rcu_segcblist.h | 26 -------------- 2 files changed, 110 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7091d824b893..7649fcd2c4c7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -35,24 +35,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->len_lazy = 0; } -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - /* * Dequeue the oldest rcu_head structure from the specified callback * list. This function assumes that the callback is non-lazy, but @@ -102,17 +84,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) rsclp->tails[RCU_NEXT_TAIL] = NULL; } -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - /* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? @@ -133,50 +104,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); } -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. 
- */ -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - /* * Return a pointer to the first callback in the specified rcu_segcblist * structure. This is useful for diagnostics. @@ -202,17 +129,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) return NULL; } -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - /* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index c2f319f3f06a..581c12b63544 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) rclp->len_lazy--; } -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(!rclp->head); - return rclp->tail; -} - void rcu_cblist_init(struct rcu_cblist *rclp); -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, -- cgit v1.2.3 From 09efeeee173e9f541b15157d30658cd8b23ec4f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Jul 2017 10:56:46 -0700 Subject: rcu: Move callback-list warning to irq-disable region After adopting callbacks from a newly offlined CPU, the adopting CPU checks to make sure that its callback list's count is zero only if the list has no callbacks and vice versa. Unfortunately, it does so after enabling interrupts, which means that false positives are possible due to interrupt handlers invoking call_rcu(). Although these false positives are improbable, rcutorture did make it happen once. This commit therefore moves this check to an irq-disabled region of code, thus suppressing the false positive. 
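The failure mode is a plain two-reads race: once interrupts are back on, an interrupt handler can invoke call_rcu() between the "is the list empty?" read and the "is the count zero?" read, so the two samples can disagree even though the list itself is never inconsistent. The user-space sketch below reproduces that effect with a thread standing in for the interrupt handler; it is an illustrative analogue only, not kernel code, and every name in it is made up for the demo.

/* Illustrative user-space analogue of the false-positive check.
 * Build: cc -O2 -pthread cblist_race.c -o cblist_race
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long cb_count;     /* stands in for rcu_segcblist_n_cbs()  */
static atomic_bool cb_nonempty;  /* stands in for !rcu_segcblist_empty() */

static void *irq_like_enqueuer(void *arg)
{
	(void)arg;
	for (;;) {               /* stands in for call_rcu() from an IRQ */
		atomic_store(&cb_nonempty, true);
		atomic_fetch_add(&cb_count, 1);
		atomic_fetch_sub(&cb_count, 1);
		atomic_store(&cb_nonempty, false);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, irq_like_enqueuer, NULL);
	for (long i = 0; i < 100000000L; i++) {
		/* Two separate reads, as when checking after IRQs are re-enabled. */
		bool empty = !atomic_load(&cb_nonempty);
		bool zero  = atomic_load(&cb_count) == 0;

		if (empty != zero) {
			printf("transient mismatch after %ld checks\n", i);
			return 0;   /* the "false positive" */
		}
	}
	puts("no mismatch observed this run (timing dependent)");
	return 0;
}

Moving the WARN_ON_ONCE() in front of raw_spin_unlock_irqrestore_rcu_node(), as the diff below does, closes exactly this window: with interrupts still off, nothing can enqueue a callback between the two reads.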
Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d330c17c8df4..4b03bddbca3c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3869,14 +3869,14 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != + !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); - WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != - !rcu_segcblist_n_cbs(&my_rdp->cblist)); } /* -- cgit v1.2.3 From 560c6e452d8fa1e98cc50674d3408924387a983e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 19 Jul 2017 15:42:47 +0530 Subject: cpufreq: schedutil: Set dynamic_switching to true Set dynamic_switching to 'true' to disallow use of schedutil governor for platforms with transition_latency set to CPUFREQ_ETERNAL, as they may not want to do automatic dynamic frequency switching. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 89c4dd9777bb..45fcf21ad685 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -646,6 +646,7 @@ static void sugov_limits(struct cpufreq_policy *policy) static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, + .dynamic_switching = true, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, -- cgit v1.2.3 From b6eb66fd3430a1a0d1c89cf4bdb01062bdb9b738 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Jul 2017 02:19:35 +0200 Subject: device property: export irqchip_fwnode_ops The newly added irqchip_fwnode_ops structure is not exported, which can lead to link errors: ERROR: "irqchip_fwnode_ops" [drivers/gpio/gpio-xgene-sb.ko] undefined! I checked that all other such symbols that were introduced are exported if they need to be, this is the only missing one. Fixes: db3e50f3234b (device property: Get rid of struct fwnode_handle type field) Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e064fd1390f1..9e8a075117a3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -42,6 +42,7 @@ static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif const struct fwnode_operations irqchip_fwnode_ops; +EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for -- cgit v1.2.3 From a5a0809bc58e133d674e45175b052c9bdf002f1d Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 23 Jul 2017 08:54:25 -0700 Subject: cpufreq: schedutil: Make iowait boost more energy efficient Currently the iowait_boost feature in schedutil makes the frequency go to max on iowait wakeups. 
This feature was added to handle a case that Peter described where the throughput of operations involving continuous I/O requests [1] is reduced due to running at a lower frequency, however the lower throughput itself causes utilization to be low and hence causing frequency to be low hence its "stuck". Instead of going to max, its also possible to achieve the same effect by ramping up to max if there are repeated in_iowait wakeups happening. This patch is an attempt to do that. We start from a lower frequency (policy->min) and double the boost for every consecutive iowait update until we reach the maximum iowait boost frequency (iowait_boost_max). I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine with intel_pstate in passive mode using schedutil. In this test the iowait_boost value ramped from 800MHz to 4GHz in 60ms. The patch achieves the desired improved throughput as the existing behavior. [1] https://patchwork.kernel.org/patch/9735885/ Suggested-by: Peter Zijlstra Suggested-by: Viresh Kumar Signed-off-by: Joel Fernandes Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 29a397067ffa..148844a995a8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -53,6 +53,7 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + bool iowait_boost_pending; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -169,30 +170,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { if (flags & SCHED_CPUFREQ_IOWAIT) { - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + if (sg_cpu->iowait_boost_pending) + return; + + sg_cpu->iowait_boost_pending = true; + + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else { + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + } } else if (sg_cpu->iowait_boost) { s64 delta_ns = time - sg_cpu->last_update; /* Clear iowait_boost if the CPU apprears to have been idle. 
*/ - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } } } static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util = sg_cpu->iowait_boost; - unsigned long boost_max = sg_cpu->iowait_boost_max; + unsigned long boost_util, boost_max; - if (!boost_util) + if (!sg_cpu->iowait_boost) return; + if (sg_cpu->iowait_boost_pending) { + sg_cpu->iowait_boost_pending = false; + } else { + sg_cpu->iowait_boost >>= 1; + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + sg_cpu->iowait_boost = 0; + return; + } + } + + boost_util = sg_cpu->iowait_boost; + boost_max = sg_cpu->iowait_boost_max; + if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; } - sg_cpu->iowait_boost >>= 1; } #ifdef CONFIG_NO_HZ_COMMON @@ -264,6 +289,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; + j_sg_cpu->iowait_boost_pending = false; continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) -- cgit v1.2.3 From 251accf98591d7f59f7a2bac2e05c66d16bf2811 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 23 Jul 2017 08:54:26 -0700 Subject: cpufreq: schedutil: Use unsigned int for iowait boost Make iowait_boost and iowait_boost_max as unsigned int since its unit is kHz and this is consistent with struct cpufreq_policy. Also change the local variables in sugov_iowait_boost() to match this. Signed-off-by: Joel Fernandes Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 148844a995a8..ddd385f2a985 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -54,8 +54,8 @@ struct sugov_cpu { struct sugov_policy *sg_policy; bool iowait_boost_pending; - unsigned long iowait_boost; - unsigned long iowait_boost_max; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; /* The fields below are only needed when sharing a policy. */ @@ -196,7 +196,7 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util, boost_max; + unsigned int boost_util, boost_max; if (!sg_cpu->iowait_boost) return; -- cgit v1.2.3 From aec47caa747e1e15a3363dc0b0db37e51b8d4f15 Mon Sep 17 00:00:00 2001 From: Pierre Kuo Date: Tue, 11 Jul 2017 14:40:55 +0800 Subject: printk: Modify operators of printed_len and text_len With commit ("printk: report lost messages in printk safe/nmi contexts") and commit <8b1742c9c207> ("printk: remove zap_locks() function"), it seems we can remove initialization, "=0", of text_len and directly assign result of log_output to printed_len. 
Link: http://lkml.kernel.org/r/1499755255-6258-1-git-send-email-vichy.kuo@gmail.com Cc: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Cc: joe@perches.com Signed-off-by: Pierre Kuo Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fc47863f629c..229fbdcbe6ef 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1698,10 +1698,10 @@ asmlinkage int vprintk_emit(int facility, int level, { static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len = 0; + size_t text_len; enum log_flags lflags = 0; unsigned long flags; - int printed_len = 0; + int printed_len; bool in_sched = false; if (level == LOGLEVEL_SCHED) { @@ -1754,7 +1754,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); + printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_unlock_irqrestore(flags); -- cgit v1.2.3 From 2b1be689f3aadcfe0a768314c80e43483c784659 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Fri, 14 Jul 2017 14:51:12 +0200 Subject: printk/console: Always disable boot consoles that use init memory before it is freed Commit 4c30c6f566c0 ("kernel/printk: do not turn off bootconsole in printk_late_init() if keep_bootcon") added a check on keep_bootcon to ensure that boot consoles were kept around until the real console is registered. This can lead to problems if the boot console data and code are in the init section, since it can be freed before the boot console is unregistered. Commit 81cc26f2bd11 ("printk: only unregister boot consoles when necessary") fixed this a better way. It allowed to keep boot consoles that did not use init data. Unfortunately it did not remove the check of keep_bootcon. This can lead to crashes and weird panics when the bootconsole is accessed after free, especially if page poisoning is in use and the code / data have been overwritten with a poison value. To prevent this, always free the boot console if it is within the init section. In addition, print a warning about that the console is removed prematurely. Finally there is a new comment how to avoid the warning. It replaced an explanation that duplicated a more comprehensive function description few lines above. Fixes: 4c30c6f566c0 ("kernel/printk: do not turn off bootconsole in printk_late_init() if keep_bootcon") Link: http://lkml.kernel.org/r/1500036673-7122-2-git-send-email-pmladek@suse.com Cc: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: "David S. Miller" Cc: Alan Cox Cc: "Fabio M. Di Nitto" Cc: linux-serial@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Matt Redfearn [pmladek@suse.com: print the warning, code and comments clean up] Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 229fbdcbe6ef..76985ee3dfff 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2650,9 +2650,8 @@ void __init console_init(void) * makes it difficult to diagnose problems that occur during this time. 
* * To mitigate this problem somewhat, only unregister consoles whose memory - * intersects with the init section. Note that code exists elsewhere to get - * rid of the boot console as soon as the proper console shows up, so there - * won't be side-effects from postponing the removal. + * intersects with the init section. Note that all other boot consoles will + * get unregistred when the real preferred console is registered. */ static int __init printk_late_init(void) { @@ -2660,16 +2659,15 @@ static int __init printk_late_init(void) int ret; for_each_console(con) { - if (!keep_bootcon && con->flags & CON_BOOT) { + if ((con->flags & CON_BOOT) && + init_section_intersects(con, sizeof(*con))) { /* - * Make sure to unregister boot consoles whose data - * resides in the init section before the init section - * is discarded. Boot consoles whose data will stick - * around will automatically be unregistered when the - * proper console replaces them. + * Please, consider moving the reported consoles out + * of the init section. */ - if (init_section_intersects(con, sizeof(*con))) - unregister_console(con); + pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", + con->name, con->index); + unregister_console(con); } } ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, -- cgit v1.2.3 From 5a814231ae3d4f248a8ecb668a072a1da471c656 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 14 Jul 2017 14:51:13 +0200 Subject: printk/console: Enhance the check for consoles using init memory printk_late_init() is responsible for disabling boot consoles that use init memory. It checks the address of struct console for this. But this is not enough. For example, there are several early consoles that have write() method in the init section and struct console in the normal section. They are not disabled and could cause fancy and hard to debug system states. It is even more complicated by the macros EARLYCON_DECLARE() and OF_EARLYCON_DECLARE() where various struct members are set at runtime by the provided setup() function. I have tried to reproduce this problem and forced the classic uart early console to stay using keep_bootcon parameter. In particular I used earlycon=uart,io,0x3f8 keep_bootcon console=ttyS0,115200. The system did not boot: [ 1.570496] PM: Image not found (code -22) [ 1.570496] PM: Image not found (code -22) [ 1.571886] PM: Hibernation image not present or could not be loaded. [ 1.571886] PM: Hibernation image not present or could not be loaded. [ 1.576407] Freeing unused kernel memory: 2528K [ 1.577244] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) The double lines are caused by having both early uart console and ttyS0 console enabled at the same time. The early console stopped working when the init memory was freed. Fortunately, the invalid call was caught by the NX-protexted page check and did not cause any silent fancy problems. This patch adds a check for many other addresses stored in struct console. It omits setup() and match() that are used only when the console is registered. Therefore they have already been used at this point and there is no reason to use them again. Link: http://lkml.kernel.org/r/1500036673-7122-3-git-send-email-pmladek@suse.com Cc: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Matt Redfearn Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: "David S. Miller" Cc: Alan Cox Cc: "Fabio M. 
Di Nitto" Cc: linux-serial@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 76985ee3dfff..87f1a8f4e0f9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2659,8 +2659,16 @@ static int __init printk_late_init(void) int ret; for_each_console(con) { - if ((con->flags & CON_BOOT) && - init_section_intersects(con, sizeof(*con))) { + if (!(con->flags & CON_BOOT)) + continue; + + /* Check addresses that might be used for enabled consoles. */ + if (init_section_intersects(con, sizeof(*con)) || + init_section_contains(con->write, 0) || + init_section_contains(con->read, 0) || + init_section_contains(con->device, 0) || + init_section_contains(con->unblank, 0) || + init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. -- cgit v1.2.3 From 8397913303abc9333f376a518a8368fa22ca5e6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Jul 2017 12:21:11 +0200 Subject: genirq/cpuhotplug: Revert "Set force affinity flag on hotplug migration" That commit was part of the changes moving x86 to the generic CPU hotplug interrupt migration code. The force flag was required on x86 before the hierarchical irqdomain rework, but invoking set_affinity() with force=true stayed and had no side effects. At some point in the past, the force flag got repurposed to support the exynos timer interrupt affinity setting to a not yet online CPU, so the interrupt controller callback does not verify the supplied affinity mask against cpu_online_mask. Setting the flag in the CPU hotplug code causes the cpu online masking to be blocked on these irq controllers and results in potentially affining an interrupt to the CPU which is unplugged, i.e. instead of moving it away, it's just reassigned to it. As the force flags is not longer needed on x86, it's safe to revert that patch so the ARM irqchips which use the force flag work again. Add comments to that effect, so this won't happen again. Note: The online mask handling should be done in the generic code and the force flag and the masking in the irq chips removed all together, but that's not a change possible for 4.13. Fixes: 77f85e66aa8b ("genirq/cpuhotplug: Set force affinity flag on hotplug migration") Reported-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Cc: Marc Zyngier Cc: Russell King Cc: LAK Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707271217590.3109@nanos Signed-off-by: Thomas Gleixner --- kernel/irq/cpuhotplug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index aee8f7ec40af..638eb9c83d9f 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = cpu_online_mask; brokeaff = true; } - - err = irq_do_set_affinity(d, affinity, true); + /* + * Do not set the force argument of irq_do_set_affinity() as this + * disables the masking of offline CPUs from the supplied affinity + * mask and therefore might keep/reassign the irq to the outgoing + * CPU. 
+ */ + err = irq_do_set_affinity(d, affinity, false); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); -- cgit v1.2.3 From 35732cf9dd38b1efb0f2f22c91c61b51337d1ac3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Jul 2017 13:30:21 -0700 Subject: srcu: Provide ordering for CPU not involved in grace period Tree RCU guarantees that every online CPU has a memory barrier between any given grace period and any of that CPU's RCU read-side sections that must be ordered against that grace period. Since RCU doesn't always know where read-side critical sections are, the actual implementation guarantees order against prior and subsequent non-idle non-offline code, whether in an RCU read-side critical section or not. As a result, there does not need to be a memory barrier at the end of synchronize_rcu() and friends because the ordering internal to the grace period has ordered every CPU's post-grace-period execution against each CPU's pre-grace-period execution, again for all non-idle online CPUs. In contrast, SRCU can have non-idle online CPUs that are completely uninvolved in a given SRCU grace period, for example, a CPU that never runs any SRCU read-side critical sections and took no part in the grace-period processing. It is in theory possible for a given synchronize_srcu()'s wakeup to be delivered to a CPU that was completely uninvolved in the prior SRCU grace period, which could mean that the code following that synchronize_srcu() would end up being unordered with respect to both the grace period and any pre-existing SRCU read-side critical sections. This commit therefore adds an smp_mb() to the end of __synchronize_srcu(), which prevents this scenario from occurring. Reported-by: Lance Roy Signed-off-by: Paul E. McKenney Acked-by: Lance Roy Cc: # 4.12.x --- kernel/rcu/srcutree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 94bd6ed43ea3..c1c0ee3cce3b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -897,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); + + /* + * Make sure that later code is ordered after the SRCU grace + * period. This pairs with the raw_spin_lock_irq_rcu_node() + * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed + * because the current CPU might have been totally uninvolved with + * (and thus unordered against) that grace period. + */ + smp_mb(); } /** -- cgit v1.2.3 From 1ad0f0a7aa1bf3bd42dcd108a96713d255eacd9f Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Thu, 27 Jul 2017 16:27:14 -0500 Subject: workqueue: Work around edge cases for calc of pool's cpumask There is an underlying assumption/trade-off in many layers of the Linux system that CPU <-> node mapping is static. This is despite the presence of features like NUMA and 'hotplug' that support the dynamic addition/ removal of fundamental system resources like CPUs and memory. PowerPC systems, however, do provide extensive features for the dynamic change of resources available to a system. Currently, there is little or no synchronization protection around the updating of the CPU <-> node mapping, and the export/update of this information for other layers / modules. 
In systems which can change this mapping during 'hotplug', like PowerPC, the information is changing underneath all layers that might reference it. This patch attempts to ensure that a valid, usable cpumask attribute is used by the workqueue infrastructure when setting up new resource pools. It prevents a crash that has been observed when an 'empty' cpumask is passed along to the worker/task scheduling code. It is intended as a temporary workaround until a more fundamental review and correction of the issue can be done. [With additions to the patch provided by Tejun Hao ] Signed-off-by: Michael Bringmann Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7146ea70a62d..ca937b0c3a96 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + + if (cpumask_empty(cpumask)) { + pr_warn_once("WARNING: workqueue cpumask: online intersect > " + "possible intersect\n"); + return false; + } + return !cpumask_equal(cpumask, attrs->cpumask); use_dfl: -- cgit v1.2.3 From 955dbdf4ce87fd9be4bc8378e26b8c2eb8b3d184 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 17 Jun 2017 08:10:08 -0400 Subject: sched: Allow migrating kthreads into online but inactive CPUs Per-cpu workqueues have been tripping CPU affinity sanity checks while a CPU is being offlined. A per-cpu kworker ends up running on a CPU which isn't its target CPU while the CPU is online but inactive. While the scheduler allows kthreads to wake up on an online but inactive CPU, it doesn't allow a running kthread to be migrated to such a CPU, which leads to an odd situation where setting affinity on a sleeping and running kthread leads to different results. Each mem-reclaim workqueue has one rescuer which guarantees forward progress and the rescuer needs to bind itself to the CPU which needs help in making forward progress; however, due to the above issue, while set_cpus_allowed_ptr() succeeds, the rescuer doesn't end up on the correct CPU if the CPU is in the process of going offline, tripping the sanity check and executing the work item on the wrong CPU. This patch updates __migrate_task() so that kthreads can be migrated into an inactive but online CPU. Signed-off-by: Tejun Heo Reported-by: "Paul E. McKenney" Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..bfee6ea7db49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -951,8 +951,13 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return rq; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return rq; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) -- cgit v1.2.3 From aa8188253474b4053bc2900d9fcb545ce68bdf5c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:51 -0700 Subject: kernfs: add exportfs operations Now we have the facilities to implement exportfs operations. 
The idea is cgroup can export the fhandle info to userspace, then userspace uses fhandle to find the cgroup name. Another example is userspace can get fhandle for a cgroup and BPF uses the fhandle to filter info for the cgroup. Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..6cefa277f39c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1737,7 +1737,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); -- cgit v1.2.3 From ca1136c99b66b1566781ff12ecddc635d570f932 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:53 -0700 Subject: blktrace: export cgroup info in trace Currently blktrace isn't cgroup aware. blktrace prints out task name of current context, but the task of current context isn't always in the cgroup where the BIO comes from. We can't use task name to find out IO cgroup. For example, Writeback BIOs always comes from flusher thread but the BIOs are for different blk cgroups. Request could be requeued and dispatched from completely different tasks. MD/DM are another examples. This patch tries to fix the gap. We print out cgroup fhandle info in blktrace. Userspace can use open_by_handle_at() syscall to find the cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to find fhandle for a cgroup and use a BPF program to filter out blktrace for a specific cgroup. We add a new 'blk_cgroup' trace option for blk tracer. It's default off. Application which doesn't know the new option isn't affected. When it's on, we output fhandle info right after blk_io_trace with an extra bit set in event action. So from application point of view, blktrace with the option will output new actions. I didn't change blk trace event yet, since I'm not sure if changing the trace event output is an ABI issue. If not, I'll do it later. Acked-by: Steven Rostedt (VMware) Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 231 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 158 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc364f86100a..f393d7a43695 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "../../block/blk.h" @@ -46,10 +47,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CGROUP 0x2 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, +#endif { } }; @@ -68,7 +73,8 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. 
*/ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) + const void *data, size_t len, + union kernfs_node_id *cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -76,12 +82,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, + sizeof(*t) + len + cgid_len, 0, pc); if (!event) return; @@ -92,17 +99,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len); + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; - t->action = action; + t->action = action | (cgid ? __BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); + t->pdu_len = len + cgid_len; + if (cgid) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -122,7 +131,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm)); + sizeof(tsk->comm), NULL); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +148,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); local_irq_restore(flags); } @@ -167,7 +176,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -204,7 +213,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data) + void *pdu_data, union kernfs_node_id *cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -215,6 +224,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? 
sizeof(*cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -229,6 +239,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -241,7 +253,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, + sizeof(*t) + pdu_len + cgid_len, 0, pc); if (!event) return; @@ -258,7 +270,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -280,10 +292,12 @@ record_it: t->action = what; t->device = bt->dev; t->error = error; - t->pdu_len = pdu_len; + t->pdu_len = pdu_len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -684,6 +698,36 @@ void blk_trace_shutdown(struct request_queue *q) } } +#ifdef CONFIG_BLK_CGROUP +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct blk_trace *bt = q->blk_trace; + + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return NULL; + + if (!bio->bi_css) + return NULL; + return cgroup_get_kernfs_id(bio->bi_css->cgroup); +} +#else +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + return NULL; +} +#endif + +static union kernfs_node_id * +blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +{ + if (!rq->bio) + return NULL; + /* Use the first bio */ + return blk_trace_bio_get_cgid(q, rq->bio); +} + /* * blktrace probes */ @@ -694,13 +738,15 @@ void blk_trace_shutdown(struct request_queue *q) * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action + * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. 
* **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what) + unsigned int nr_bytes, u32 what, + union kernfs_node_id *cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -713,32 +759,36 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL); + rq->cmd_flags, what, error, 0, NULL, cgid); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { - blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq->q, rq)); } /** @@ -753,7 +803,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u32 what, int error, union kernfs_node_id *cgid) { struct blk_trace *bt = q->blk_trace; @@ -761,20 +811,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -782,7 +834,8 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -790,13 +843,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_getrq(void *ignore, @@ -804,13 +859,14 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - 
blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL); + NULL, NULL); } } @@ -820,13 +876,14 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, NULL); } } @@ -835,7 +892,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -852,7 +909,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); } } @@ -868,7 +925,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu); + &rpdu, blk_trace_bio_get_cgid(q, bio)); } } @@ -901,7 +958,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, - sizeof(r), &r); + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); } /** @@ -934,7 +991,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r); + sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); } /** @@ -958,7 +1015,8 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, 0, len, data); + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(q, rq)); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1031,7 +1089,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; - if (t->action == BLK_TN_MESSAGE) { + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } @@ -1066,9 +1124,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) return (const struct blk_io_trace *)ent; } -static inline const void *pdu_start(const struct trace_entry *ent) +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent) + 1; + return (void *)(te_blk_io_trace(ent) + 1) + + (has_cg ? sizeof(union kernfs_node_id) : 0); +} + +static inline const void *cgid_start(const struct trace_entry *ent) +{ + return (void *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) +{ + return te_blk_io_trace(ent)->pdu_len - + (has_cg ? 
sizeof(union kernfs_node_id) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1096,16 +1166,16 @@ static inline __u16 t_error(const struct trace_entry *ent) return te_blk_io_trace(ent)->error; } -static __u64 get_pdu_int(const struct trace_entry *ent) +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent); + const __u64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) + struct blk_io_trace_remap *r, bool has_cg) { - const struct blk_io_trace_remap *__r = pdu_start(ent); + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); __u64 sector_from = __r->sector_from; r->device_from = be32_to_cpu(__r->device_from); @@ -1113,9 +1183,11 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); -static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1131,24 +1203,33 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act) secs, nsec_rem, iter->ent->pid, act, rwbs); } -static void blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); + if (has_cg) { + const union kernfs_node_id *id = cgid_start(iter->ent); + + trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); } -static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; @@ -1179,7 +1260,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) trace_seq_puts(s, ") "); } -static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1187,7 +1268,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) @@ -1199,10 +1280,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) } static void blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) + const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - blk_log_dump_pdu(s, ent); + 
blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) @@ -1215,18 +1296,18 @@ static void blk_log_with_error(struct trace_seq *s, } } -static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { struct blk_io_trace_remap r = { .device_from = 0, }; - get_pdu_remap(ent, &r); + get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(r.device_from), MINOR(r.device_from), (unsigned long long)r.sector_from); } -static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1235,30 +1316,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) trace_seq_printf(s, "[%s]\n", cmd); } -static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } -static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + get_pdu_int(ent, has_cg), cmd); } -static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) { - const struct blk_io_trace *t = te_blk_io_trace(ent); - trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } @@ -1298,7 +1380,8 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - void (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1326,23 +1409,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, u16 what; bool long_act; blk_log_action_t *log_action; + bool has_cg; t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; - if (t->action == BLK_TN_MESSAGE) { - log_action(iter, long_act ? "message" : "m"); - blk_log_msg(s, iter->ent); + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? 
"message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { - log_action(iter, what2act[what].act[long_act]); - what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); -- cgit v1.2.3 From 69fd5c391763bd94a40dd152bc72a7f230137150 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:55 -0700 Subject: blktrace: add an option to allow displaying cgroup path By default we output cgroup id in blktrace. This adds an option to display cgroup path. Since get cgroup path is a relativly heavy operation, we don't enable it by default. with the option enabled, blktrace will output something like this: dd-1353 [007] d..2 293.015252: 8,0 /test/level D R 24 + 8 [dd] Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 12 ++++++++++++ kernel/trace/blktrace.c | 14 +++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6cefa277f39c..2aba1c519138 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4701,6 +4701,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f393d7a43695..e90974ed4532 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -48,12 +48,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; @@ -1213,7 +1215,17 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, if (has_cg) { const union kernfs_node_id *id = cgid_start(iter->ent); - trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), id->ino, id->generation, act, rwbs); } else -- cgit v1.2.3 From 35fe6d763229e8fc0eb5f9b93a401673cfcb5e1e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:56 -0700 Subject: block: use standard blktrace API to output cgroup info for debug notes Currently cfq/bfq/blk-throttle output cgroup info in trace in their own way. Now we have standard blktrace API for this, so convert them to use it. 
Note, this changes the behavior a little bit. cgroup info isn't output by default, we only do this with 'blk_cgroup' option enabled. cgroup info isn't output as a string by default too, we only do this with 'blk_cgname' option enabled. Also cgroup info is output in different position of the note string. I think these behavior changes aren't a big issue (actually we make trace data shorter which is good), since the blktrace note is solely for debugging. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e90974ed4532..7724de18d2fe 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -154,7 +154,8 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, + const char *fmt, ...) { int n; va_list args; @@ -178,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + blkcg = NULL; +#ifdef CONFIG_BLK_CGROUP + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); +#else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -375,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, "%s", msg); + __trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; -- cgit v1.2.3 From 89b096898a8450b0a5b97d521e000ae9f94f81f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 27 Jul 2017 21:02:46 +0200 Subject: bpf: don't indicate success when copy_from_user fails err in bpf_prog_get_info_by_fd() still holds 0 at that time from prior check_uarg_tail_zero() check. Explicitly return -EFAULT instead, so user space can be notified of buggy behavior. Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 045646da97cc..84bb39975ad4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1289,7 +1289,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info_len = min_t(u32, sizeof(info), info_len); if (copy_from_user(&info, uinfo, info_len)) - return err; + return -EFAULT; info.type = prog->type; info.id = prog->aux->id; -- cgit v1.2.3 From 9975a54b3c9ecf029cbf5dd7a8c9701b1d74029e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 Jul 2017 17:05:25 +0200 Subject: bpf: fix bpf_prog_get_info_by_fd to dump correct xlated_prog_len bpf_prog_size(prog->len) is not the correct length we want to dump back to user space. The code in bpf_prog_get_info_by_fd() uses this to copy prog->insnsi to user space, but bpf_prog_size(prog->len) also includes the size of struct bpf_prog itself plus program instructions and is usually used either in context of accounting or for bpf_prog_alloc() et al, thus we copy out of bounds in bpf_prog_get_info_by_fd() potentially. Use the correct bpf_prog_insn_size() instead. 
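In other words, the copy length for prog->insnsi was computed with a helper that also counts the bookkeeping header sitting in front of the instruction array, so extra bytes past the end of the array could be copied to user space. The sketch below is a simplified model of the two size calculations; the struct layout and macro names are invented for illustration and the real kernel definitions differ in detail.

/* Simplified model of the two size helpers, per the changelog above.
 * These are NOT the literal kernel definitions.
 */
#include <stddef.h>
#include <stdio.h>

struct insn { unsigned char bytes[8]; };         /* one eBPF instruction */

struct prog {                                    /* stand-in for struct bpf_prog */
	unsigned int len;                        /* number of instructions */
	/* ... other housekeeping fields ... */
	struct insn insnsi[];                    /* instruction array */
};

/* header + instructions: fine for allocation and accounting */
#define PROG_SIZE(len)       (sizeof(struct prog) + (len) * sizeof(struct insn))
/* instructions only: the most that may be copied out of ->insnsi */
#define PROG_INSN_SIZE(len)  ((len) * sizeof(struct insn))

int main(void)
{
	unsigned int len = 16;

	printf("copying %zu bytes from insnsi, but only %zu are valid\n",
	       PROG_SIZE(len), PROG_INSN_SIZE(len));
	/* The difference is the header that lies before insnsi, so using
	 * PROG_SIZE as the copy length reads past the instruction array. */
	return 0;
}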
Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 84bb39975ad4..6c772adabad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1312,7 +1312,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.xlated_prog_len; - info.xlated_prog_len = bpf_prog_size(prog->len); + info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); -- cgit v1.2.3 From 313c8c16ee62b32b8b40c6b00637b401dc19050e Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 28 Jul 2017 15:09:25 +0800 Subject: PM / CPU: replace raw_notifier with atomic_notifier This patch replaces an rwlock and raw notifier by an atomic notifier protected by a spin_lock and RCU. The main reason for this change is due to a 'scheduling while atomic' bug with RT kernels on ARM/ARM64. On ARM/ARM64, the rwlock cpu_pm_notifier_lock in cpu_pm_enter/exit() causes a potential schedule after IRQ disable in the idle call chain: cpu_startup_entry cpu_idle_loop local_irq_disable() cpuidle_idle_call call_cpuidle cpuidle_enter cpuidle_enter_state ->enter :arm_enter_idle_state cpu_pm_enter/exit CPU_PM_CPU_IDLE_ENTER read_lock(&cpu_pm_notifier_lock); <-- sleep in idle __rt_spin_lock(); schedule(); The kernel panic is here: [ 4.609601] BUG: scheduling while atomic: swapper/1/0/0x00000002 [ 4.609608] [] arm_enter_idle_state+0x18/0x70 [ 4.609614] Modules linked in: [ 4.609615] [] cpuidle_enter_state+0xf0/0x218 [ 4.609620] [] cpuidle_enter+0x18/0x20 [ 4.609626] Preemption disabled at: [ 4.609627] [] call_cpuidle+0x24/0x40 [ 4.609635] [] schedule_preempt_disabled+0x1c/0x28 [ 4.609639] [] cpu_startup_entry+0x154/0x1f8 [ 4.609645] [] secondary_start_kernel+0x15c/0x1a0 Daniel Lezcano said this notification is needed on arm/arm64 platforms. Sebastian suggested using atomic_notifier instead of rwlock, which is not only removing the sleeping in idle, but also improving latency. Tony Lindgren found a miss use that rcu_read_lock used after rcu_idle_enter Paul McKenney suggested trying RCU_NONIDLE. Signed-off-by: Alex Shi Tested-by: Tony Lindgren Acked-by: Sebastian Andrzej Siewior [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/cpu_pm.c | 50 +++++++++++++------------------------------------- 1 file changed, 13 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 009cc9a17d95..67b02e138a47 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -22,15 +22,21 @@ #include #include -static DEFINE_RWLOCK(cpu_pm_notifier_lock); -static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); +static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) { int ret; - ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, + /* + * __atomic_notifier_call_chain has a RCU read critical section, which + * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let + * RCU know this. 
+ */ + rcu_irq_enter_irqson(); + ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, nr_to_call, nr_calls); + rcu_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) */ int cpu_pm_register_notifier(struct notifier_block *nb) { - unsigned long flags; - int ret; - - write_lock_irqsave(&cpu_pm_notifier_lock, flags); - ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb); - write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); - - return ret; + return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb); } EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); @@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); */ int cpu_pm_unregister_notifier(struct notifier_block *nb) { - unsigned long flags; - int ret; - - write_lock_irqsave(&cpu_pm_notifier_lock, flags); - ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); - write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); - - return ret; + return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); } EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); @@ -100,7 +92,6 @@ int cpu_pm_enter(void) int nr_calls; int ret = 0; - read_lock(&cpu_pm_notifier_lock); ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); if (ret) /* @@ -108,7 +99,6 @@ int cpu_pm_enter(void) * PM entry who are notified earlier to prepare for it. */ cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); - read_unlock(&cpu_pm_notifier_lock); return ret; } @@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter); */ int cpu_pm_exit(void) { - int ret; - - read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); - read_unlock(&cpu_pm_notifier_lock); - - return ret; + return cpu_pm_notify(CPU_PM_EXIT, -1, NULL); } EXPORT_SYMBOL_GPL(cpu_pm_exit); @@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void) int nr_calls; int ret = 0; - read_lock(&cpu_pm_notifier_lock); ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); if (ret) /* @@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void) * PM entry who are notified earlier to prepare for it. */ cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); - read_unlock(&cpu_pm_notifier_lock); return ret; } @@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); */ int cpu_cluster_pm_exit(void) { - int ret; - - read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); - read_unlock(&cpu_pm_notifier_lock); - - return ret; + return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); } EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); -- cgit v1.2.3 From 34f41c0316ed52b0b44542491d89278efdaa70e4 Mon Sep 17 00:00:00 2001 From: Matija Glavinic Pecotic Date: Tue, 1 Aug 2017 09:11:52 +0200 Subject: timers: Fix overflow in get_next_timer_interrupt For e.g. HZ=100, timer being 430 jiffies in the future, and 32 bit unsigned int, there is an overflow on unsigned int right-hand side of the expression which results with wrong values being returned. Type cast the multiplier to 64bit to avoid that issue. 
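A standalone user-space sketch of the arithmetic being fixed (illustrative values only, not kernel code): with HZ=100, TICK_NSEC is 10,000,000 ns, so a 430-jiffy delta yields 4,300,000,000 ns, which wraps a 32-bit multiply; widening one operand, as the patch does, keeps the full value.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t basem = 0;		/* base monotonic time, ns */
	uint32_t tick_nsec = 10000000;	/* NSEC_PER_SEC / HZ for HZ=100 */
	uint32_t delta = 430;		/* nextevt - basej, in jiffies */

	/* 32-bit product wraps: 4,300,000,000 mod 2^32 == 5,032,704 */
	uint64_t wrong = basem + delta * tick_nsec;
	/* widened multiply, matching the patched expression */
	uint64_t right = basem + (uint64_t)delta * tick_nsec;

	printf("wrong=%llu right=%llu\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}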
Fixes: 46c8f0b077a8 ("timers: Fix get_next_timer_interrupt() computation") Signed-off-by: Matija Glavinic Pecotic Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Sverdlin Cc: khilman@baylibre.com Cc: akpm@linux-foundation.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/a7900f04-2a21-c9fd-67be-ab334d459ee5@nokia.com --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 71ce3f4eead3..8f5d1bf18854 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1495,7 +1495,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) base->is_idle = false; } else { if (!is_max_delta) - expires = basem + (nextevt - basej) * TICK_NSEC; + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle: */ -- cgit v1.2.3 From 674e75411fc260b0d4532701228cfe12fc090da8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Jul 2017 12:16:38 +0530 Subject: sched: cpufreq: Allow remote cpufreq callbacks With Android UI and benchmarks the latency of cpufreq response to certain scheduling events can become very critical. Currently, callbacks into cpufreq governors are only made from the scheduler if the target CPU of the event is the same as the current CPU. This means there are certain situations where a target CPU may not run the cpufreq governor for some time. One test case that shows this behavior is where a task starts running on CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the system is configured such that the new tasks should receive maximum demand initially, this should result in CPU0 increasing frequency immediately. But because of the above-mentioned limitation, this does not occur. This patch updates the scheduler core to call the cpufreq callbacks for remote CPUs as well. The schedutil, ondemand and conservative governors are updated to process cpufreq utilization update hooks called for remote CPUs where the remote CPU is managed by the cpufreq policy of the local CPU. The intel_pstate driver is updated to always reject remote callbacks. This is tested with a couple of use cases (Android: hackbench, recentfling, galleryfling, vellamo, Ubuntu: hackbench) on an ARM hikey board (64 bit octa-core, single policy). Only galleryfling showed minor improvements, while the others didn't show much deviation. The reason is that this patch only targets a corner case, where all of the following must be true to improve performance, and that doesn't happen too often with these tests: - Task is migrated to another CPU. - The task has high demand, and should take the target CPU to higher OPPs. - And the target CPU doesn't call into the cpufreq governor until the next tick. Based on initial work from Steve Muckle. Signed-off-by: Viresh Kumar Acked-by: Saravana Kannan Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J.
Wysocki --- kernel/sched/cpufreq_schedutil.c | 31 ++++++++++++++++++++++++++----- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 8 +++++--- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 10 ++-------- 5 files changed, 35 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index ddd385f2a985..7dbc76801f86 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -52,6 +52,7 @@ struct sugov_policy { struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + unsigned int cpu; bool iowait_boost_pending; unsigned int iowait_boost; @@ -77,6 +78,21 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) { s64 delta_ns; + /* + * Since cpufreq_update_util() is called with rq->lock held for + * the @target_cpu, our per-cpu data is fully serialized. + * + * However, drivers cannot in general deal with cross-cpu + * requests, so while get_next_freq() will work, our + * sugov_update_commit() call may not. + * + * Hence stop here for remote requests if they aren't supported + * by the hardware, as calculating the frequency is pointless if + * we cannot in fact act on it. + */ + if (!cpufreq_can_do_remote_dvfs(sg_policy->policy)) + return false; + if (sg_policy->work_in_progress) return false; @@ -155,12 +171,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max) +static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) { - struct rq *rq = this_rq(); + struct rq *rq = cpu_rq(cpu); unsigned long cfs_max; - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + cfs_max = arch_scale_cpu_capacity(NULL, cpu); *util = min(rq->cfs.avg.util_avg, cfs_max); *max = cfs_max; @@ -254,7 +270,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -316,7 +332,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); raw_spin_lock(&sg_policy->update_lock); @@ -697,6 +713,11 @@ struct cpufreq_governor *cpufreq_default_governor(void) static int __init sugov_register(void) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(sugov_cpu, cpu).cpu = cpu; + return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..5c3bf4bd0327 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1136,7 +1136,7 @@ static void update_curr_dl(struct rq *rq) } /* kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + cpufreq_update_util(rq, SCHED_CPUFREQ_DL); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..d378d02fdfcb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3278,7 +3278,9 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { - if (&this_rq()->cfs == cfs_rq) { + struct rq *rq = rq_of(cfs_rq); + + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3295,7 +3297,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq_of(cfs_rq), 0); + cpufreq_update_util(rq, 0); } } @@ -4875,7 +4877,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * passed. */ if (p->in_iowait) - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { if (se->on_rq) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 45caf937ef90..0af5ca9e3e3f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -970,7 +970,7 @@ static void update_curr_rt(struct rq *rq) return; /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + cpufreq_update_util(rq, SCHED_CPUFREQ_RT); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..aa9d5b87b4f8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2070,19 +2070,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); if (data) data->func(data, rq_clock(rq), flags); } - -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -{ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_update_util(rq, flags); -} #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity -- cgit v1.2.3 From bc2eecd7ecce40af43b6eb3d256b6076257df846 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 1 Aug 2017 00:31:32 -0400 Subject: futex: Allow for compiling out PI support This makes it possible to preserve basic futex support and compile out the PI support when RT mutexes are not available. Signed-off-by: Nicolas Pitre Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1708010024190.5981@knanqh.ubzr --- kernel/futex.c | 22 ++++++++++++++++++++++ kernel/locking/rtmutex_common.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 16dbe4c93895..ad0af4df1b9d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -875,6 +875,8 @@ static struct task_struct *futex_find_get_task(pid_t pid) return p; } +#ifdef CONFIG_FUTEX_PI + /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. 
@@ -932,6 +934,8 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +#endif + /* * We need to check the following states: * @@ -1799,6 +1803,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. + */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. This @@ -2594,6 +2607,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (refill_pi_state_cache()) return -ENOMEM; @@ -2773,6 +2789,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) struct futex_q *top_waiter; int ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + retry: if (get_user(uval, uaddr)) return -EFAULT; @@ -2983,6 +3002,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (uaddr == uaddr2) return -EINVAL; diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 72ad45a9a794..8d039b928d61 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,6 +40,9 @@ struct rt_mutex_waiter { /* * Various helpers to access the waiters-tree: */ + +#ifdef CONFIG_RT_MUTEXES + static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters); @@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p) pi_tree_entry); } +#else + +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return false; +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + return NULL; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return false; +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return NULL; +} + +#endif + /* * lock->owner state tracking: */ -- cgit v1.2.3 From c39a0e2c8850f08249383f2425dbd8dbe4baad69 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:20 -0700 Subject: x86/perf/cqm: Wipe out perf based cqm 'perf cqm' never worked due to the incompatibility between perf infrastructure and cqm hardware support. The hardware uses RMIDs to track the llc occupancy of tasks and these RMIDs are per package. This makes monitoring a hierarchy like cgroup along with monitoring of tasks separately difficult and several patches sent to lkml to fix them were NACKed. Further more, the following issues in the current perf cqm make it almost unusable: 1. No support to monitor the same group of tasks for which we do allocation using resctrl. 2. It gives random and inaccurate data (mostly 0s) once we run out of RMIDs due to issues in Recycling. 3. Recycling results in inaccuracy of data because we cannot guarantee that the RMID was stolen from a task when it was not pulling data into cache or even when it pulled the least data. Also for monitoring llc_occupancy, if we stop using an RMID_x and then start using an RMID_y after we reclaim an RMID from an other event, we miss accounting all the occupancy that was tagged to RMID_x at a later perf_count. 
2. Recycling code makes the monitoring code complex including scheduling because the event can lose RMID any time. Since MBM counters count bandwidth for a period of time by taking snap shot of total bytes at two different times, recycling complicates the way we count MBM in a hierarchy. Also we need a spin lock while we do the processing to account for MBM counter overflow. We also currently use a spin lock in scheduling to prevent the RMID from being taken away. 4. Lack of support when we run different kind of event like task, system-wide and cgroup events together. Data mostly prints 0s. This is also because we can have only one RMID tied to a cpu as defined by the cqm hardware but a perf can at the same time tie multiple events during one sched_in. 5. No support of monitoring a group of tasks. There is partial support for cgroup but it does not work once there is a hierarchy of cgroups or if we want to monitor a task in a cgroup and the cgroup itself. 6. No support for monitoring tasks for the lifetime without perf overhead. 7. It reported the aggregate cache occupancy or memory bandwidth over all sockets. But most cloud and VMM based use cases want to know the individual per-socket usage. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-2-git-send-email-vikas.shivappa@linux.intel.com --- kernel/events/core.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..6e171540c0af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3625,10 +3625,7 @@ unlock: static inline u64 perf_event_count(struct perf_event *event) { - if (event->pmu->count) - return event->pmu->count(event); - - return __perf_event_count(event); + return local64_read(&event->count) + atomic64_read(&event->child_count); } /* @@ -3659,15 +3656,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } - /* - * It must not have a pmu::count method, those are not - * NMI safe. - */ - if (event->pmu->count) { - ret = -EOPNOTSUPP; - goto out; - } - /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { -- cgit v1.2.3 From 4bb0f0e73c8c30917d169c4a0f1ac083690c545b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Aug 2017 12:01:52 -0400 Subject: tracing: Call clear_boot_tracer() at lateinit_sync The clear_boot_tracer function is used to reset the default_bootup_tracer string to prevent it from being accessed after boot, as it originally points to init data. But since clear_boot_tracer() is called via the init_lateinit() call, it races with the initcall for registering the hwlat tracer. If someone adds "ftrace=hwlat" to the kernel command line, depending on how the linker sets up the text, the saved command line may be cleared, and the hwlat tracer never is initialized. Simply have the clear_boot_tracer() be called by initcall_lateinit_sync() as that's for tasks to be called after lateinit. 
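A minimal sketch of why the new ordering holds (paraphrasing the initcall macros in include/linux/init.h; exact expansions vary by kernel version): both macros register at level 7, but the _sync variant is emitted into a separate section that the boot code only walks after all plain level-7 entries have run, so clear_boot_tracer() can no longer race ahead of hwlat's registration.

/* simplified view of the two registration macros */
#define late_initcall(fn)	__define_initcall(fn, 7)	/* ".initcall7.init"  */
#define late_initcall_sync(fn)	__define_initcall(fn, 7s)	/* ".initcall7s.init" */

late_initcall_sync(clear_boot_tracer);	/* now ordered after every plain late_initcall() */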
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196551 Cc: stable@vger.kernel.org Fixes: e7c15cd8a ("tracing: Added hardware latency tracer") Reported-by: Zamir SUN Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42b9355033d4..784fb43b2abe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8407,4 +8407,4 @@ __init static int clear_boot_tracer(void) } fs_initcall(tracer_init_tracefs); -late_initcall(clear_boot_tracer); +late_initcall_sync(clear_boot_tracer); -- cgit v1.2.3 From 147d88e0b5eb90191bc5c12ca0a3c410b75a13d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 1 Aug 2017 14:02:01 +0300 Subject: tracing: Missing error code in tracer_alloc_buffers() If ring_buffer_alloc() or one of the next couple function calls fail then we should return -ENOMEM but the current code returns success. Link: http://lkml.kernel.org/r/20170801110201.ajdkct7vwzixahvx@mwanda Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: b32614c03413 ('tracing/rb: Convert to hotplug state machine') Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 784fb43b2abe..d815fc317e9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8293,6 +8293,7 @@ __init static int tracer_alloc_buffers(void) if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ + ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; -- cgit v1.2.3 From a7e52ad7ed82e21273eccff93d1477a7b313aabb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 2 Aug 2017 14:20:54 -0400 Subject: ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. 
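The diff below switches the allocator from returning NULL to returning ERR_PTR() codes so callers can tell "this CPU has no ring buffer" (-ENODEV) apart from allocation failure (-ENOMEM). A tiny user-space mirror of that convention, purely to illustrate the encoding (the kernel's real helpers live in include/linux/err.h):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095	/* same bound the kernel uses */

static inline void *err_ptr(long error) { return (void *)error; }
static inline long ptr_err(const void *ptr) { return (long)ptr; }
static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	void *page = err_ptr(-ENODEV);	/* offline CPU: no ring buffer to read */

	if (is_err(page))
		printf("no page, error %ld\n", ptr_err(page));
	return 0;
}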
Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 14 +++++++++----- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 16 +++++++++++----- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 529cc50d7243..81279c6602ff 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * the page that was allocated, with the read page of the buffer. * * Returns: - * The page allocated, or NULL on error. + * The page allocated, or ERR_PTR */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; unsigned long flags; struct page *page; + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-ENODEV); + + cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); bpage = page_address(page); @@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); - * if (!rpage) - * return error; + * if (IS_ERR(rpage)) + * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9fbcaf567886..68ee79afe31c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,7 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer, cpu); - if (!bpage) + if (IS_ERR(bpage)) return EVENT_DROPPED; ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d815fc317e9d..44004d8aa3b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; - ssize_t ret; + ssize_t ret = 0; ssize_t size; if (!count) @@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - info->spare_cpu = iter->cpu_file; + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + } } if (!info->spare) - return -ENOMEM; + return ret; /* Do we have previous read data to read? 
*/ if (info->read < PAGE_SIZE) @@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); - if (!ref->page) { - ret = -ENOMEM; + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; kfree(ref); break; } -- cgit v1.2.3 From 0679dee03c6d706d57145ea92c23d08fa10a1999 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 2 Aug 2017 17:55:29 +0100 Subject: cgroup: keep track of number of descent cgroups Keep track of the number of online and dying descent cgroups. This data will be used later to add an ability to control cgroup hierarchy (limit the depth and the number of descent cgroups) and display hierarchy stats. Signed-off-by: Roman Gushchin Suggested-by: Tejun Heo Signed-off-by: Tejun Heo Cc: Zefan Li Cc: Waiman Long Cc: Johannes Weiner Cc: kernel-team@fb.com Cc: cgroups@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- kernel/cgroup/cgroup.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 85f6a112344b..cfdbb1e780de 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4408,9 +4408,15 @@ static void css_release_work_fn(struct work_struct *work) if (ss->css_released) ss->css_released(css); } else { + struct cgroup *tcgrp; + /* cgroup release path */ trace_cgroup_release(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; + tcgrp = cgroup_parent(tcgrp)) + tcgrp->nr_dying_descendants--; + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4609,9 +4615,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->root = root; cgrp->level = level; - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + if (tcgrp != cgrp) + tcgrp->nr_descendants++; + } + if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4817,7 +4827,7 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -4865,6 +4875,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + tcgrp->nr_descendants--; + tcgrp->nr_dying_descendants++; + } + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ -- cgit v1.2.3 From 1a926e0bbab83bae8207d05a533173425e0496d1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jul 2017 18:28:44 +0100 Subject: cgroup: implement hierarchy limits Creating cgroup hierearchies of unreasonable size can affect overall system performance. A user might want to limit the size of cgroup hierarchy. This is especially important if a user is delegating some cgroup sub-tree. To address this issue, introduce an ability to control the size of cgroup hierarchy. The cgroup.max.descendants control file allows to set the maximum allowed number of descendant cgroups. The cgroup.max.depth file controls the maximum depth of the cgroup tree. 
Both are single value r/w files, with "max" default value. The control files exist on each hierarchy level (including root). When a new cgroup is created, we check the total descendants and depth limits on each level, and if none of them are exceeded, a new cgroup is created. Only alive cgroups are counted, removed (dying) cgroups are ignored. Signed-off-by: Roman Gushchin Suggested-by: Tejun Heo Signed-off-by: Tejun Heo Cc: Zefan Li Cc: Waiman Long Cc: Johannes Weiner Cc: kernel-team@fb.com Cc: cgroups@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- kernel/cgroup/cgroup.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cfdbb1e780de..0fd9134e1720 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1827,6 +1827,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; + cgrp->max_descendants = INT_MAX; + cgrp->max_depth = INT_MAX; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3209,6 +3211,92 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static int cgroup_max_descendants_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int descendants = READ_ONCE(cgrp->max_descendants); + + if (descendants == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", descendants); + + return 0; +} + +static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int descendants; + ssize_t ret; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + descendants = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &descendants); + if (ret) + return ret; + } + + if (descendants < 0 || descendants > INT_MAX) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_descendants = descendants; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + +static int cgroup_max_depth_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int depth = READ_ONCE(cgrp->max_depth); + + if (depth == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", depth); + + return 0; +} + +static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int depth; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + depth = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &depth); + if (ret) + return ret; + } + + if (depth < 0 || depth > INT_MAX) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_depth = depth; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", @@ -4309,6 +4397,16 @@ static struct cftype cgroup_base_files[] = { .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, + { + .name = "cgroup.max.descendants", + .seq_show = cgroup_max_descendants_show, + .write = cgroup_max_descendants_write, + }, + { + .name = "cgroup.max.depth", + .seq_show = cgroup_max_depth_show, + .write = cgroup_max_depth_write, + }, { } /* terminate */ }; @@ -4662,6 +4760,29 @@ out_free_cgrp: return ERR_PTR(ret); } +static bool 
cgroup_check_hierarchy_limits(struct cgroup *parent) +{ + struct cgroup *cgroup; + int ret = false; + int level = 1; + + lockdep_assert_held(&cgroup_mutex); + + for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { + if (cgroup->nr_descendants >= cgroup->max_descendants) + goto fail; + + if (level > cgroup->max_depth) + goto fail; + + level++; + } + + ret = true; +fail: + return ret; +} + int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; @@ -4676,6 +4797,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (!parent) return -ENODEV; + if (!cgroup_check_hierarchy_limits(parent)) { + ret = -EAGAIN; + goto out_unlock; + } + cgrp = cgroup_create(parent); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); -- cgit v1.2.3 From ec39225cca42c05ac36853d11d28f877fde5c42e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 2 Aug 2017 17:55:31 +0100 Subject: cgroup: add cgroup.stat interface with basic hierarchy stats A cgroup can consume resources even after being deleted by a user. For example, writing back dirty pages should be accounted and limited, even though the corresponding cgroup might contain no processes and has already been deleted by the user. In the current implementation a cgroup can remain in such a "dying" state for an undefined amount of time. For instance, this happens if a memory cgroup contains a page mlocked by a process belonging to another cgroup. Although the lifecycle of a dying cgroup is out of the user's control, it's important to have some insight into what's going on under the hood. In particular, it's handy to have a counter which allows detecting css leaks. To solve this problem, add a cgroup.stat interface to the base cgroup control files with the following metrics: nr_descendants total number of visible descendant cgroups nr_dying_descendants total number of dying descendant cgroups Signed-off-by: Roman Gushchin Suggested-by: Tejun Heo Signed-off-by: Tejun Heo Cc: Zefan Li Cc: Waiman Long Cc: Johannes Weiner Cc: kernel-team@fb.com Cc: cgroups@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- kernel/cgroup/cgroup.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0fd9134e1720..a06755a610e1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3304,6 +3304,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_stats_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgroup = seq_css(seq)->cgroup; + + seq_printf(seq, "nr_descendants %d\n", + cgroup->nr_descendants); + seq_printf(seq, "nr_dying_descendants %d\n", + cgroup->nr_dying_descendants); + + return 0; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4407,6 +4419,10 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, + { + .name = "cgroup.stat", + .seq_show = cgroup_stats_show, + }, { } /* terminate */ }; -- cgit v1.2.3 From 5a621e6c958e057c727a30c502f28bf2bc04adfd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 2 Aug 2017 17:55:32 +0100 Subject: cgroup: re-use the parent pointer in cgroup_destroy_locked() As we already have a pointer to the parent cgroup in cgroup_destroy_locked(), we don't need to calculate it again to pass it as an argument to cgroup1_check_for_release().
Signed-off-by: Roman Gushchin Suggested-by: Tejun Heo Signed-off-by: Tejun Heo Cc: Zefan Li Cc: Waiman Long Cc: Johannes Weiner Cc: kernel-team@fb.com Cc: linux-kernel@vger.kernel.org --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a06755a610e1..92e599796220 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5022,7 +5022,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) tcgrp->nr_dying_descendants++; } - cgroup1_check_for_release(cgroup_parent(cgrp)); + cgroup1_check_for_release(parent); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); -- cgit v1.2.3 From 13d82fb77abb9625f3ca74f5c4cbedde0f412f01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Aug 2017 15:39:38 -0700 Subject: cgroup: short-circuit cset_cgroup_from_root() on the default hierarchy Each css_set directly points to the default cgroup it belongs to, so there's no reason to walk the cgrp_links list on the default hierarchy. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 92e599796220..f5ca55db1fe1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1311,6 +1311,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; -- cgit v1.2.3 From 27e37d84e55f47bf28f7072fadf8fa2cbc3d9375 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 2 Aug 2017 13:31:50 -0700 Subject: pid: kill pidhash_size in pidhash_init() After commit 3d375d78593c ("mm: update callers to use HASH_ZERO flag"), drop unused pidhash_size in pidhash_init(). Link: http://lkml.kernel.org/r/1500389267-49222-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 731c4e528f4e..c69c30d827e5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int pidhash_size; - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); - pidhash_size = 1U << pidhash_shift; } void __init pidmap_init(void) -- cgit v1.2.3 From 89affbf5d9ebb15c6460596822e8857ea2f9e735 Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Wed, 2 Aug 2017 13:32:18 -0700 Subject: cpuset: fix a deadlock due to incomplete patching of cpusets_enabled() In codepaths that use the begin/retry interface for reading mems_allowed_seq with irqs disabled, there exists a race condition that stalls the patch process after only modifying a subset of the static_branch call sites. This problem manifested itself as a deadlock in the slub allocator, inside get_any_partial. The loop reads mems_allowed_seq value (via read_mems_allowed_begin), performs the defrag operation, and then verifies the consistency of mem_allowed via the read_mems_allowed_retry and the cookie returned by xxx_begin. The issue here is that both begin and retry first check if cpusets are enabled via cpusets_enabled() static branch. 
This branch can be rewritted dynamically (via cpuset_inc) if a new cpuset is created. The x86 jump label code fully synchronizes across all CPUs for every entry it rewrites. If it rewrites only one of the callsites (specifically the one in read_mems_allowed_retry) and then waits for the smp_call_function(do_sync_core) to complete while a CPU is inside the begin/retry section with IRQs off and the mems_allowed value is changed, we can hang. This is because begin() will always return 0 (since it wasn't patched yet) while retry() will test the 0 against the actual value of the seq counter. The fix is to use two different static keys: one for begin (pre_enable_key) and one for retry (enable_key). In cpuset_inc(), we first bump the pre_enable key to ensure that cpuset_mems_allowed_begin() always return a valid seqcount if are enabling cpusets. Similarly, when disabling cpusets via cpuset_dec(), we first ensure that callers of cpuset_mems_allowed_retry() will start ignoring the seqcount value before we let cpuset_mems_allowed_begin() return 0. The relevant stack traces of the two stuck threads: CPU: 1 PID: 1415 Comm: mkdir Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8817f9c28000 task.stack: ffffc9000ffa4000 RIP: smp_call_function_many+0x1f9/0x260 Call Trace: smp_call_function+0x3b/0x70 on_each_cpu+0x2f/0x90 text_poke_bp+0x87/0xd0 arch_jump_label_transform+0x93/0x100 __jump_label_update+0x77/0x90 jump_label_update+0xaa/0xc0 static_key_slow_inc+0x9e/0xb0 cpuset_css_online+0x70/0x2e0 online_css+0x2c/0xa0 cgroup_apply_control_enable+0x27f/0x3d0 cgroup_mkdir+0x2b7/0x420 kernfs_iop_mkdir+0x5a/0x80 vfs_mkdir+0xf6/0x1a0 SyS_mkdir+0xb7/0xe0 entry_SYSCALL_64_fastpath+0x18/0xad ... CPU: 2 PID: 1 Comm: init Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8818087c0000 task.stack: ffffc90000030000 RIP: int3+0x39/0x70 Call Trace: <#DB> ? ___slab_alloc+0x28b/0x5a0 ? copy_process.part.40+0xf7/0x1de0 __slab_alloc.isra.80+0x54/0x90 copy_process.part.40+0xf7/0x1de0 copy_process.part.40+0xf7/0x1de0 kmem_cache_alloc_node+0x8a/0x280 copy_process.part.40+0xf7/0x1de0 _do_fork+0xe7/0x6c0 _raw_spin_unlock_irq+0x2d/0x60 trace_hardirqs_on_caller+0x136/0x1d0 entry_SYSCALL_64_fastpath+0x5/0xad do_syscall_64+0x27/0x350 SyS_clone+0x19/0x20 do_syscall_64+0x60/0x350 entry_SYSCALL64_slow_path+0x25/0x25 Link: http://lkml.kernel.org/r/20170731040113.14197-1-dmitriyz@waymo.com Fixes: 46e700abc44c ("mm, page_alloc: remove unnecessary taking of a seqlock when cpusets are disabled") Signed-off-by: Dima Zavin Reported-by: Cliff Spradlin Acked-by: Vlastimil Babka Cc: Peter Zijlstra Cc: Christopher Lameter Cc: Li Zefan Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..8d5151688504 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -63,6 +63,7 @@ #include #include +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ -- cgit v1.2.3 From e870c6c87cf9484090d28f2a68aa29e008960c93 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 31 Jul 2017 23:43:18 +0200 Subject: ACPI / PM: Prefer suspend-to-idle over S3 on some systems Modify the ACPI system sleep support setup code to select suspend-to-idle as the default system sleep state if (1) the ACPI_FADT_LOW_POWER_S0 flag is set in the FADT and (2) the Low Power Idle S0 _DSM interface has been discovered and (3) the default sleep state was not selected from the kernel command line. The main motivation for this change is that systems where the (1) and (2) conditions are met typically ship with OSes that don't exercise the S3 path in the platform firmware which remains untested and turns out to be non-functional at least in some cases. Signed-off-by: Rafael J. Wysocki Tested-by: Mario Limonciello --- kernel/power/power.h | 1 - kernel/power/suspend.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 268c1b0afc28..1d2d761e3c25 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -192,7 +192,6 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); extern const char * const pm_labels[]; extern const char *pm_states[]; extern const char *mem_sleep_states[]; -extern suspend_state_t mem_sleep_current; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4bce46ddc2cd..0639d3a79852 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -48,7 +48,7 @@ static const char * const mem_sleep_labels[] = { const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; -static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; suspend_state_t pm_suspend_target_state; EXPORT_SYMBOL_GPL(pm_suspend_target_state); @@ -216,7 +216,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; - if (mem_sleep_default == PM_SUSPEND_MEM) + if (mem_sleep_default >= PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } -- cgit v1.2.3 From fbb77611e95d3d5b2af86a59754a3130877cb667 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sat, 5 Aug 2017 23:00:50 +0300 Subject: Fix compat_sys_sigpending breakage The latest change of compat_sys_sigpending in commit 8f13621abced ("sigpending(): move compat to native") has broken it in two ways. First, it tries to write 4 bytes more than userspace expects: sizeof(old_sigset_t) == sizeof(long) == 8 instead of sizeof(compat_old_sigset_t) == sizeof(u32) == 4. Second, on big endian architectures these bytes are being written in the wrong order. This bug was found by strace test suite. Reported-by: Anatoly Pugachev Inspired-by: Eugene Syromyatnikov Fixes: 8f13621abced ("sigpending(): move compat to native") Signed-off-by: Dmitry V. 
Levin Acked-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index caed9133ae52..7e33f8c583e6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3303,12 +3303,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { +#ifdef __BIG_ENDIAN sigset_t set; - int err = do_sigpending(&set, sizeof(old_sigset_t)); - if (err == 0) - if (copy_to_user(set32, &set, sizeof(old_sigset_t))) - err = -EFAULT; + int err = do_sigpending(&set, sizeof(set.sig[0])); + if (!err) + err = put_user(set.sig[0], set32); return err; +#else + return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); +#endif } #endif -- cgit v1.2.3 From 9a2614916ac564d6ea1d0a5cb986298bc508c3bf Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Sun, 6 Aug 2017 19:33:22 -0700 Subject: workqueue: fix path to documentation Signed-off-by: Benjamin Peterson Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a86688fabc55..4fa6c7650f09 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -21,7 +21,7 @@ * pools for workqueues which are not bound to any specific CPU - the * number of these backing pools is dynamic. * - * Please read Documentation/workqueue.txt for details. + * Please read Documentation/core-api/workqueue.rst for details. */ #include -- cgit v1.2.3 From cf5f5cea270655dd49370760576c64b228583b79 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 4 Aug 2017 16:00:09 -0700 Subject: bpf: add support for sys_enter_* and sys_exit_* tracepoints Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_* style tracepoints. The iovisor/bcc issue #748 (https://github.com/iovisor/bcc/issues/748) documents this issue. For example, if you try to attach a bpf program to tracepoints syscalls/sys_enter_newfstat, you will get the following error: # ./tools/trace.py t:syscalls:sys_enter_newfstat Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument Failed to attach BPF to tracepoint The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_* tracepoints are treated differently from other tracepoints and there is no bpf hook to it. This patch adds bpf support for these syscalls tracepoints by . permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF . calling bpf programs in perf_syscall_enter and perf_syscall_exit The legality of bpf program ctx access is also checked. Function trace_event_get_offsets returns correct max offset for each specific syscall tracepoint, which is compared against the maximum offset access in bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/events/core.c | 10 ++++---- kernel/trace/trace_syscalls.c | 53 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..a7a6c1d19a49 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8050,7 +8050,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { - bool is_kprobe, is_tracepoint; + bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -8061,7 +8061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; - if (!is_kprobe && !is_tracepoint) + is_syscall_tp = is_syscall_trace_event(event->tp_event); + if (!is_kprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; @@ -8070,13 +8071,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return PTR_ERR(prog); if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || - (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } - if (is_tracepoint) { + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) { diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..7a1a92036563 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; +static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) { + struct syscall_tp_t { + unsigned long long regs; + unsigned long syscall_nr; + unsigned long args[sys_data->nb_args]; + } param; + int i; + + *(struct pt_regs **)¶m = regs; + param.syscall_nr = rec->nr; + for (i = 0; i < sys_data->nb_args; i++) + param.args[i] = rec->args[i]; + return trace_call_bpf(prog, ¶m); +} + static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); + + if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, 
sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } +static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_trace_exit *rec) { + struct syscall_tp_t { + unsigned long long regs; + unsigned long syscall_nr; + unsigned long ret; + } param; + + *(struct pt_regs **)¶m = regs; + param.syscall_nr = rec->nr; + param.ret = rec->ret; + return trace_call_bpf(prog, ¶m); +} + static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); + + if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } -- cgit v1.2.3 From 4cc7b9544b9a904add353406ed1bacbf56f75c52 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 4 Aug 2017 22:02:19 -0700 Subject: bpf: devmap fix mutex in rcu critical section Originally we used a mutex to protect concurrent devmap update and delete operations from racing with netdev unregister notifier callbacks. The notifier hook is needed because we increment the netdev ref count when a dev is added to the devmap. This ensures the netdev reference is valid in the datapath. However, we don't want to block unregister events, hence the initial mutex and notifier handler. The concern was in the notifier hook we search the map for dev entries that hold a refcnt on the net device being torn down. But, in order to do this we require two steps, (i) dereference the netdev: dev = rcu_dereference(map[i]) (ii) test ifindex: dev->ifindex == removing_ifindex and then finally we can swap in the NULL dev in the map via an xchg operation, xchg(map[i], NULL) The danger here is a concurrent update could run a different xchg op concurrently leading us to replace the new dev with a NULL dev incorrectly. CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) dev = rcu_dereference(map[i]) xchg(map[i]), new_dev); rcu_call(dev,...) xchg(map[i], NULL) The above flow would create the incorrect state with the dev reference in the update path being lost. To resolve this the original code used a mutex around the above block. However, updates, deletes, and lookups occur inside rcu critical sections so we can't use a mutex in this context safely. Fortunately, by writing slightly better code we can avoid the mutex altogether. If CPU 1 in the above example uses a cmpxchg and _only_ replaces the dev reference in the map when it is in fact the expected dev the race is removed completely. The two cases being illustrated here, first the race condition, CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) dev = rcu_dereference(map[i]) xchg(map[i]), new_dev); rcu_call(dev,...) 
odev = cmpxchg(map[i], dev, NULL) Now we can test the cmpxchg return value, detect odev != dev and abort. Or in the good case, CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) odev = cmpxchg(map[i], dev, NULL) [...] Now 'odev == dev' and we can do proper cleanup. And viola the original race we tried to solve with a mutex is corrected and the trace noted by Sasha below is resolved due to removal of the mutex. Note: When walking the devmap and removing dev references as needed we depend on the core to fail any calls to dev_get_by_index() using the ifindex of the device being removed. This way we do not race with the user while searching the devmap. Additionally, the mutex was also protecting list add/del/read on the list of maps in-use. This patch converts this to an RCU list and spinlock implementation. This protects the list from concurrent alloc/free operations. The notifier hook walks this list so it uses RCU read semantics. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747 in_atomic(): 1, irqs_disabled(): 0, pid: 16315, name: syz-executor1 1 lock held by syz-executor1/16315: #0: (rcu_read_lock){......}, at: [] map_delete_elem kernel/bpf/syscall.c:577 [inline] #0: (rcu_read_lock){......}, at: [] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] #0: (rcu_read_lock){......}, at: [] SyS_bpf+0x1d32/0x4ba0 kernel/bpf/syscall.c:1388 Fixes: 2ddf71e23cc2 ("net: add notifier hooks for devmap bpf map") Reported-by: Sasha Levin Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d439ee0eadb1..7192fb67d4de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -40,11 +40,12 @@ * contain a reference to the net device and remove them. This is a two step * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. - * Unfortunately, the xchg() operations do not protect against this. To avoid - * potentially removing incorrect objects the dev_map_list_mutex protects - * conflicting netdev unregister and BPF syscall operations. Updates and - * deletes from a BPF program (done in rcu critical section) are blocked - * because of this mutex. + * When removing the dev a cmpxchg() is used to ensure the correct dev is + * removed, in the case of a concurrent update or delete operation it is + * possible that the initially referenced dev is no longer in the map. As the + * notifier hook walks the map we know that new dev references can not be + * added by the user because core infrastructure ensures dev_get_by_index() + * calls will fail at this point. 
*/ #include #include @@ -68,7 +69,7 @@ struct bpf_dtab { struct list_head list; }; -static DEFINE_MUTEX(dev_map_list_mutex); +static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -128,9 +129,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_dtab; - mutex_lock(&dev_map_list_mutex); - list_add_tail(&dtab->list, &dev_map_list); - mutex_unlock(&dev_map_list_mutex); + spin_lock(&dev_map_lock); + list_add_tail_rcu(&dtab->list, &dev_map_list); + spin_unlock(&dev_map_lock); return &dtab->map; free_dtab: @@ -169,7 +170,6 @@ static void dev_map_free(struct bpf_map *map) * at this point we we can still race with netdev notifier, hence the * lock. */ - mutex_lock(&dev_map_list_mutex); for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -184,8 +184,9 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ - list_del(&dtab->list); - mutex_unlock(&dev_map_list_mutex); + spin_lock(&dev_map_lock); + list_del_rcu(&dtab->list); + spin_unlock(&dev_map_lock); free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); @@ -322,11 +323,9 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) * the driver tear down ensures all soft irqs are complete before * removing the net device in the case of dev_put equals zero. */ - mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); - mutex_unlock(&dev_map_list_mutex); return 0; } @@ -369,11 +368,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, * Remembering the driver side flush operation will happen before the * net device is removed. */ - mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[i], dev); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); - mutex_unlock(&dev_map_list_mutex); return 0; } @@ -396,22 +393,27 @@ static int dev_map_notification(struct notifier_block *notifier, switch (event) { case NETDEV_UNREGISTER: - mutex_lock(&dev_map_list_mutex); - list_for_each_entry(dtab, &dev_map_list, list) { + /* This rcu_read_lock/unlock pair is needed because + * dev_map_list is an RCU list AND to ensure a delete + * operation does not free a netdev_map entry while we + * are comparing it against the netdev being unregistered. + */ + rcu_read_lock(); + list_for_each_entry_rcu(dtab, &dev_map_list, list) { for (i = 0; i < dtab->map.max_entries; i++) { - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *dev, *odev; - dev = dtab->netdev_map[i]; + dev = READ_ONCE(dtab->netdev_map[i]); if (!dev || dev->dev->ifindex != netdev->ifindex) continue; - dev = xchg(&dtab->netdev_map[i], NULL); - if (dev) + odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); + if (dev == odev) call_rcu(&dev->rcu, __dev_map_entry_free); } } - mutex_unlock(&dev_map_list_mutex); + rcu_read_unlock(); break; default: break; -- cgit v1.2.3 From f1174f77b50c94eecaa658fdc56fa69b421de4b8 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 7 Aug 2017 15:26:19 +0100 Subject: bpf/verifier: rework value tracking Unifies adjusted and unadjusted register value types (e.g. FRAME_POINTER is now just a PTR_TO_STACK with zero offset). Tracks value alignment by means of tracking known & unknown bits. This also replaces the 'reg->imm' (leading zero bits) calculations for (what were) UNKNOWN_VALUEs. 
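As a concrete aside (editorial, not part of the patch text): the "known & unknown bits" are carried as a (value, mask) pair, the tnum added in kernel/bpf/tnum.c below, where a set mask bit means that bit of the value is unknown. The short standalone sketch that follows mirrors the patch's tnum_const()/tnum_lshift()/tnum_add() rules in plain userspace C; the scenario in main() (a 2-byte load, shifted, then offset by a constant) is only a hypothetical example of how alignment knowledge survives arithmetic.

  /* Standalone sketch of the tnum (value, mask) representation.
   * A mask bit of 1 means the corresponding value bit is unknown.
   * tnum_const/tnum_lshift/tnum_add mirror kernel/bpf/tnum.c below.
   */
  #include <stdio.h>
  #include <stdint.h>

  struct tnum { uint64_t value; uint64_t mask; };

  static struct tnum tnum_const(uint64_t v)
  {
          return (struct tnum){ .value = v, .mask = 0 };
  }

  static struct tnum tnum_lshift(struct tnum a, uint8_t shift)
  {
          return (struct tnum){ .value = a.value << shift,
                                .mask = a.mask << shift };
  }

  static struct tnum tnum_add(struct tnum a, struct tnum b)
  {
          uint64_t sm = a.mask + b.mask;
          uint64_t sv = a.value + b.value;
          uint64_t sigma = sm + sv;
          uint64_t chi = sigma ^ sv;              /* bits a carry may disturb */
          uint64_t mu = chi | a.mask | b.mask;    /* all possibly-unknown bits */

          return (struct tnum){ .value = sv & ~mu, .mask = mu };
  }

  int main(void)
  {
          /* result of a 2-byte load: upper 48 bits known zero */
          struct tnum r = { .value = 0, .mask = 0xffff };

          r = tnum_lshift(r, 2);           /* r <<= 2: low 2 bits known zero */
          r = tnum_add(r, tnum_const(14)); /* r += 14: low 2 bits now known, == 2 */
          printf("var_off=(%#llx; %#llx)\n",
                 (unsigned long long)r.value, (unsigned long long)r.mask);
          return 0;
  }

This prints var_off=(0x2; 0x7fffc): the low two bits stay known through the shift and the add, which is the alignment information the reworked verifier reads out of var_off (see tnum_is_aligned() in the diff) instead of the old reg->imm/min_align bookkeeping.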
If pointer leaks are allowed, and adjust_ptr_min_max_vals returns -EACCES, treat the pointer as an unknown scalar and try again, because we might be able to conclude something about the result (e.g. pointer & 0x40 is either 0 or 0x40). Verifier hooks in the netronome/nfp driver were changed to match the new data structures. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 2 +- kernel/bpf/tnum.c | 164 +++++ kernel/bpf/verifier.c | 1780 +++++++++++++++++++++++++++---------------------- 3 files changed, 1139 insertions(+), 807 deletions(-) create mode 100644 kernel/bpf/tnum.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 48e92705be59..2f0bcda40e90 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,6 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c new file mode 100644 index 000000000000..92eeeb1974a2 --- /dev/null +++ b/kernel/bpf/tnum.c @@ -0,0 +1,164 @@ +/* tnum: tracked (or tristate) numbers + * + * A tnum tracks knowledge about the bits of a value. Each bit can be either + * known (0 or 1), or unknown (x). Arithmetic operations on tnums will + * propagate the unknown bits such that the tnum result represents all the + * possible results for possible values of the operands. + */ +#include +#include + +#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} +/* A completely unknown value */ +const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; + +struct tnum tnum_const(u64 value) +{ + return TNUM(value, 0); +} + +struct tnum tnum_lshift(struct tnum a, u8 shift) +{ + return TNUM(a.value << shift, a.mask << shift); +} + +struct tnum tnum_rshift(struct tnum a, u8 shift) +{ + return TNUM(a.value >> shift, a.mask >> shift); +} + +struct tnum tnum_add(struct tnum a, struct tnum b) +{ + u64 sm, sv, sigma, chi, mu; + + sm = a.mask + b.mask; + sv = a.value + b.value; + sigma = sm + sv; + chi = sigma ^ sv; + mu = chi | a.mask | b.mask; + return TNUM(sv & ~mu, mu); +} + +struct tnum tnum_sub(struct tnum a, struct tnum b) +{ + u64 dv, alpha, beta, chi, mu; + + dv = a.value - b.value; + alpha = dv + a.mask; + beta = dv - b.mask; + chi = alpha ^ beta; + mu = chi | a.mask | b.mask; + return TNUM(dv & ~mu, mu); +} + +struct tnum tnum_and(struct tnum a, struct tnum b) +{ + u64 alpha, beta, v; + + alpha = a.value | a.mask; + beta = b.value | b.mask; + v = a.value & b.value; + return TNUM(v, alpha & beta & ~v); +} + +struct tnum tnum_or(struct tnum a, struct tnum b) +{ + u64 v, mu; + + v = a.value | b.value; + mu = a.mask | b.mask; + return TNUM(v, mu & ~v); +} + +struct tnum tnum_xor(struct tnum a, struct tnum b) +{ + u64 v, mu; + + v = a.value ^ b.value; + mu = a.mask | b.mask; + return TNUM(v & ~mu, mu); +} + +/* half-multiply add: acc += (unknown * mask * value). + * An intermediate step in the multiply algorithm. 
+ */ +static struct tnum hma(struct tnum acc, u64 value, u64 mask) +{ + while (mask) { + if (mask & 1) + acc = tnum_add(acc, TNUM(0, value)); + mask >>= 1; + value <<= 1; + } + return acc; +} + +struct tnum tnum_mul(struct tnum a, struct tnum b) +{ + struct tnum acc; + u64 pi; + + pi = a.value * b.value; + acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value); + return hma(acc, b.mask, a.value); +} + +/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has + * a 'known 0' - this will return a 'known 1' for that bit. + */ +struct tnum tnum_intersect(struct tnum a, struct tnum b) +{ + u64 v, mu; + + v = a.value | b.value; + mu = a.mask & b.mask; + return TNUM(v & ~mu, mu); +} + +struct tnum tnum_cast(struct tnum a, u8 size) +{ + a.value &= (1ULL << (size * 8)) - 1; + a.mask &= (1ULL << (size * 8)) - 1; + return a; +} + +bool tnum_is_aligned(struct tnum a, u64 size) +{ + if (!size) + return true; + return !((a.value | a.mask) & (size - 1)); +} + +bool tnum_in(struct tnum a, struct tnum b) +{ + if (b.mask & ~a.mask) + return false; + b.value &= ~a.mask; + return a.value == b.value; +} + +int tnum_strn(char *str, size_t size, struct tnum a) +{ + return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); +} +EXPORT_SYMBOL_GPL(tnum_strn); + +int tnum_sbin(char *str, size_t size, struct tnum a) +{ + size_t n; + + for (n = 64; n; n--) { + if (n < size) { + if (a.mask & 1) + str[n - 1] = 'x'; + else if (a.value & 1) + str[n - 1] = '1'; + else + str[n - 1] = '0'; + } + a.mask >>= 1; + a.value >>= 1; + } + str[min(size - 1, (size_t)64)] = 0; + return 64; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6e8b3887eab..c3f88b466c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -61,12 +61,12 @@ * (and -20 constant is saved for further stack bounds checking). * Meaning that this reg is a pointer to stack plus known immediate constant. * - * Most of the time the registers have UNKNOWN_VALUE type, which + * Most of the time the registers have SCALAR_VALUE type, which * means the register has some value, but it's not a valid pointer. - * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base register - * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer * types recognized by check_mem_access() function. * * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' @@ -180,15 +180,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...) 
/* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", - [UNKNOWN_VALUE] = "inv", + [SCALAR_VALUE] = "inv", [PTR_TO_CTX] = "ctx", [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", - [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", - [CONST_IMM] = "imm", [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -221,32 +218,36 @@ static void print_verifier_state(struct bpf_verifier_state *state) if (t == NOT_INIT) continue; verbose(" R%d=%s", i, reg_type_str[t]); - if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%lld", reg->imm); - else if (t == PTR_TO_PACKET) - verbose("(id=%d,off=%d,r=%d)", - reg->id, reg->off, reg->range); - else if (t == UNKNOWN_VALUE && reg->imm) - verbose("%lld", reg->imm); - else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL || - t == PTR_TO_MAP_VALUE_ADJ) - verbose("(ks=%d,vs=%d,id=%u)", - reg->map_ptr->key_size, - reg->map_ptr->value_size, - reg->id); - if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%lld", - (long long)reg->min_value); - if (reg->max_value != BPF_REGISTER_MAX_RANGE) - verbose(",max_value=%llu", - (unsigned long long)reg->max_value); - if (reg->min_align) - verbose(",min_align=%u", reg->min_align); - if (reg->aux_off) - verbose(",aux_off=%u", reg->aux_off); - if (reg->aux_off_align) - verbose(",aux_off_align=%u", reg->aux_off_align); + if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && + tnum_is_const(reg->var_off)) { + /* reg->off should be 0 for SCALAR_VALUE */ + verbose("%lld", reg->var_off.value + reg->off); + } else { + verbose("(id=%d", reg->id); + if (t != SCALAR_VALUE) + verbose(",off=%d", reg->off); + if (t == PTR_TO_PACKET) + verbose(",r=%d", reg->range); + else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose(",ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); + if (reg->min_value != BPF_REGISTER_MIN_RANGE) + verbose(",min_value=%lld", + (long long)reg->min_value); + if (reg->max_value != BPF_REGISTER_MAX_RANGE) + verbose(",max_value=%llu", + (unsigned long long)reg->max_value); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(",var_off=%s", tn_buf); + } + verbose(")"); + } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -463,14 +464,69 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void __mark_reg_not_init(struct bpf_reg_state *reg); + +/* Mark the 'variable offset' part of a register as zero. This should be + * used only on registers holding a pointer type. 
+ */ +static void __mark_reg_known_zero(struct bpf_reg_state *reg) { - BUG_ON(regno >= MAX_BPF_REG); + reg->var_off = tnum_const(0); + reg->min_value = 0; + reg->max_value = 0; +} - memset(®s[regno], 0, sizeof(regs[regno])); - regs[regno].type = NOT_INIT; - regs[regno].min_value = BPF_REGISTER_MIN_RANGE; - regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +{ + if (WARN_ON(regno >= MAX_BPF_REG)) { + verbose("mark_reg_known_zero(regs, %u)\n", regno); + /* Something bad happened, let's kill all regs */ + for (regno = 0; regno < MAX_BPF_REG; regno++) + __mark_reg_not_init(regs + regno); + return; + } + __mark_reg_known_zero(regs + regno); +} + +/* Mark a register as having a completely unknown (scalar) value. */ +static void __mark_reg_unknown(struct bpf_reg_state *reg) +{ + reg->type = SCALAR_VALUE; + reg->id = 0; + reg->off = 0; + reg->var_off = tnum_unknown; + reg->min_value = BPF_REGISTER_MIN_RANGE; + reg->max_value = BPF_REGISTER_MAX_RANGE; +} + +static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +{ + if (WARN_ON(regno >= MAX_BPF_REG)) { + verbose("mark_reg_unknown(regs, %u)\n", regno); + /* Something bad happened, let's kill all regs */ + for (regno = 0; regno < MAX_BPF_REG; regno++) + __mark_reg_not_init(regs + regno); + return; + } + __mark_reg_unknown(regs + regno); +} + +static void __mark_reg_not_init(struct bpf_reg_state *reg) +{ + __mark_reg_unknown(reg); + reg->type = NOT_INIT; +} + +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + if (WARN_ON(regno >= MAX_BPF_REG)) { + verbose("mark_reg_not_init(regs, %u)\n", regno); + /* Something bad happened, let's kill all regs */ + for (regno = 0; regno < MAX_BPF_REG; regno++) + __mark_reg_not_init(regs + regno); + return; + } + __mark_reg_not_init(regs + regno); } static void init_reg_state(struct bpf_reg_state *regs) @@ -481,23 +537,12 @@ static void init_reg_state(struct bpf_reg_state *regs) mark_reg_not_init(regs, i); /* frame pointer */ - regs[BPF_REG_FP].type = FRAME_PTR; + regs[BPF_REG_FP].type = PTR_TO_STACK; + mark_reg_known_zero(regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; -} - -static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) -{ - regs[regno].type = UNKNOWN_VALUE; - regs[regno].id = 0; - regs[regno].imm = 0; -} - -static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) -{ - BUG_ON(regno >= MAX_BPF_REG); - __mark_reg_unknown_value(regs, regno); + mark_reg_known_zero(regs, BPF_REG_1); } static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) @@ -505,14 +550,6 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; regs[regno].value_from_signed = false; - regs[regno].min_align = 0; -} - -static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, - u32 regno) -{ - mark_reg_unknown_value(regs, regno); - reset_reg_range_values(regs, regno); } enum reg_arg_type { @@ -542,7 +579,7 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, return -EACCES; } if (t == DST_OP) - mark_reg_unknown_value(regs, regno); + mark_reg_unknown(regs, regno); } return 0; } @@ -552,12 +589,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) switch (type) { case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: - case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: case 
PTR_TO_PACKET_END: - case FRAME_PTR: case CONST_PTR_TO_MAP: return true; default: @@ -637,14 +672,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value_and_range(state->regs, - value_regno); + mark_reg_unknown(state->regs, value_regno); return 0; } } /* check read/write into map element returned by bpf_map_lookup_elem() */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, +static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; @@ -657,22 +691,25 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } -/* check read/write into an adjusted map element */ -static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_verifier_state *state = &env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We adjusted the register to this map value, so we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. + /* We may have adjusted the register to this map value, so we + * need to try adding each of min_value and max_value to off + * to make sure our theoretical access will be safe. */ if (log_level) print_verifier_state(state); - env->varlen_map_value_access = true; + /* If the offset is variable, we will need to be stricter in state + * pruning from now on. + */ + if (!tnum_is_const(reg->var_off)) + env->varlen_map_value_access = true; /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. 
If we are using signed variables for our @@ -684,10 +721,9 @@ static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = check_map_access(env, regno, reg->min_value + off, size); + err = __check_map_access(env, regno, reg->min_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", - regno); + verbose("R%d min value is outside of the array range\n", regno); return err; } @@ -699,7 +735,10 @@ static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - return check_map_access(env, regno, reg->max_value + off, size); + err = __check_map_access(env, regno, reg->max_value + off, size); + if (err) + verbose("R%d max value is outside of the array range\n", regno); + return err; } #define MAX_PACKET_OFF 0xffff @@ -729,14 +768,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) +static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, + int off, int size) { struct bpf_reg_state *regs = env->cur_state.regs; struct bpf_reg_state *reg = ®s[regno]; - off += reg->off; - if (off < 0 || size <= 0 || off + size > reg->range) { + if (off < 0 || size <= 0 || (u64)off + size > reg->range) { verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; @@ -744,7 +782,35 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } -/* check access to 'struct bpf_context' fields */ +static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *reg = ®s[regno]; + int err; + + /* We may have added a variable offset to the packet pointer; but any + * reg->range we have comes after that. We are only checking the fixed + * offset. + */ + + /* We don't allow negative numbers, because we aren't tracking enough + * detail to prove they're safe. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = __check_packet_access(env, regno, off, size); + if (err) { + verbose("R%d offset is outside of the packet\n", regno); + return err; + } + return err; +} + +/* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { @@ -784,13 +850,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks, if (allow_ptr_leaks) return false; - switch (reg->type) { - case UNKNOWN_VALUE: - case CONST_IMM: - return false; - default: - return true; - } + return reg->type != SCALAR_VALUE; } static bool is_pointer_value(struct bpf_verifier_env *env, int regno) @@ -801,23 +861,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { + struct tnum reg_off; int ip_align; - int reg_off; /* Byte size accesses are always allowed. 
*/ if (!strict || size == 1) return 0; - reg_off = reg->off; - if (reg->id) { - if (reg->aux_off_align % size) { - verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", - reg->aux_off_align, size); - return -EACCES; - } - reg_off += reg->aux_off; - } - /* For platforms that do not have a Kconfig enabling * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of * NET_IP_ALIGN is universally set to '2'. And on platforms @@ -827,20 +877,37 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, * unconditional IP align value of '2'. */ ip_align = 2; - if ((ip_align + reg_off + off) % size != 0) { - verbose("misaligned packet access off %d+%d+%d size %d\n", - ip_align, reg_off, off, size); + + reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off)); + if (!tnum_is_aligned(reg_off, size)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + ip_align, tn_buf, reg->off, off, size); return -EACCES; } return 0; } -static int check_val_ptr_alignment(const struct bpf_reg_state *reg, - int size, bool strict) +static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, + const char *pointer_desc, + int off, int size, bool strict) { - if (strict && size != 1) { - verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); + struct tnum reg_off; + + /* Byte size accesses are always allowed. */ + if (!strict || size == 1) + return 0; + + reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off)); + if (!tnum_is_aligned(reg_off, size)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("misaligned %saccess off %s+%d+%d size %d\n", + pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -852,21 +919,25 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, int off, int size) { bool strict = env->strict_alignment; + const char *pointer_desc = ""; switch (reg->type) { case PTR_TO_PACKET: + /* special case, because of NET_IP_ALIGN */ return check_pkt_ptr_alignment(reg, off, size, strict); - case PTR_TO_MAP_VALUE_ADJ: - return check_val_ptr_alignment(reg, size, strict); + case PTR_TO_MAP_VALUE: + pointer_desc = "value "; + break; + case PTR_TO_CTX: + pointer_desc = "context "; + break; + case PTR_TO_STACK: + pointer_desc = "stack "; + break; default: - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", - off, size); - return -EACCES; - } - - return 0; + break; } + return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -883,52 +954,79 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn struct bpf_reg_state *reg = &state->regs[regno]; int size, err = 0; - if (reg->type == PTR_TO_STACK) - off += reg->imm; - size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; + /* alignment checks will add in reg->off themselves */ err = check_ptr_alignment(env, reg, off, size); if (err) return err; - if (reg->type == PTR_TO_MAP_VALUE || - reg->type == PTR_TO_MAP_VALUE_ADJ) { + /* for access checks, reg->off is just part of off */ + off += reg->off; + + if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); return -EACCES; } - if (reg->type == PTR_TO_MAP_VALUE_ADJ) - err = check_map_access_adj(env, regno, off, size); - else - err = 
check_map_access(env, regno, off, size); + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value_and_range(state->regs, - value_regno); + mark_reg_unknown(state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - enum bpf_reg_type reg_type = UNKNOWN_VALUE; + enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } + /* ctx accesses must be at a fixed offset, so that we can + * determine what type of data were returned. + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("variable ctx access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + off += reg->var_off.value; err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value_and_range(state->regs, - value_regno); - /* note that reg.[id|off|range] == 0 */ + /* ctx access returns either a scalar, or a + * PTR_TO_PACKET[_END]. In the latter case, we know + * the offset is zero. + */ + if (reg_type == SCALAR_VALUE) + mark_reg_unknown(state->regs, value_regno); + else + mark_reg_known_zero(state->regs, value_regno); + state->regs[value_regno].id = 0; + state->regs[value_regno].off = 0; + state->regs[value_regno].range = 0; state->regs[value_regno].type = reg_type; - state->regs[value_regno].aux_off = 0; - state->regs[value_regno].aux_off_align = 0; } - } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { + } else if (reg->type == PTR_TO_STACK) { + /* stack accesses must be at a fixed offset, so that we can + * determine what type of data were returned. + * See check_stack_read(). + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -948,7 +1046,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (state->regs[regno].type == PTR_TO_PACKET) { + } else if (reg->type == PTR_TO_PACKET) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -960,21 +1058,24 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value_and_range(state->regs, - value_regno); + mark_reg_unknown(state->regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); return -EACCES; } - if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks && - state->regs[value_regno].type == UNKNOWN_VALUE) { - /* 1 or 2 byte load zero-extends, determine the number of - * zero upper bits. Not doing it fo 4 byte load, since - * such values cannot be added to ptr_to_packet anyway. 
- */ - state->regs[value_regno].imm = 64 - size * 8; + if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && + state->regs[value_regno].type == SCALAR_VALUE) { + /* b/h/w load zero-extends, mark upper bits as known 0 */ + state->regs[value_regno].var_off = tnum_cast( + state->regs[value_regno].var_off, size); + /* sign bit is known zero, so we can bound the value */ + state->regs[value_regno].min_value = 0; + state->regs[value_regno].max_value = min_t(u64, + state->regs[value_regno].var_off.value | + state->regs[value_regno].var_off.mask, + BPF_REGISTER_MAX_RANGE); } return err; } @@ -1016,9 +1117,17 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1); } +/* Does this register contain a constant zero? */ +static bool register_is_null(struct bpf_reg_state reg) +{ + return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); +} + /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized + * and all elements of stack are initialized. + * Unlike most pointer bounds-checking functions, this one doesn't take an + * 'off' argument, so it has to add in reg->off itself. */ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, @@ -1029,9 +1138,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int off, i; if (regs[regno].type != PTR_TO_STACK) { + /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - regs[regno].type == CONST_IMM && - regs[regno].imm == 0) + register_is_null(regs[regno])) return 0; verbose("R%d type=%s expected=%s\n", regno, @@ -1040,7 +1149,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - off = regs[regno].imm; + /* Only allow fixed-offset stack reads */ + if (!tnum_is_const(regs[regno].var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + verbose("invalid variable stack read R%d var_off=%s\n", + regno, tn_buf); + } + off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { verbose("invalid stack type R%d off=%d access_size=%d\n", @@ -1071,16 +1188,14 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; - switch (regs[regno].type) { + switch (reg->type) { case PTR_TO_PACKET: - return check_packet_access(env, regno, 0, access_size); + return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: - return check_map_access(env, regno, 0, access_size); - case PTR_TO_MAP_VALUE_ADJ: - return check_map_access_adj(env, regno, 0, access_size); - default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_map_access(env, regno, reg->off, access_size); + default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); } @@ -1123,11 +1238,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { - expected_type = CONST_IMM; - /* One 
exception. Allow UNKNOWN_VALUE registers when the - * boundaries are known and don't cause unsafe memory accesses - */ - if (type != UNKNOWN_VALUE && type != expected_type) + expected_type = SCALAR_VALUE; + if (type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1141,13 +1253,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be - * passed in as argument, it's a CONST_IMM type. Final test + * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (type == CONST_IMM && reg->imm == 0) + if (register_is_null(*reg)) /* final test in check_stack_boundary() */; else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && - type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) + type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { @@ -1173,7 +1285,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } if (type == PTR_TO_PACKET) - err = check_packet_access(env, regno, 0, + err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else err = check_stack_boundary(env, regno, @@ -1189,7 +1301,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } if (type == PTR_TO_PACKET) - err = check_packet_access(env, regno, 0, + err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else err = check_stack_boundary(env, regno, @@ -1209,10 +1321,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } - /* If the register is UNKNOWN_VALUE, the access check happens - * using its boundaries. Otherwise, just use its imm + /* The register is SCALAR_VALUE; the access check + * happens using its boundaries. 
*/ - if (type == UNKNOWN_VALUE) { + + if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to * initialize all the memory that the helper could @@ -1220,35 +1333,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ meta = NULL; - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); - return -EACCES; - } - - if (reg->min_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; - } + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); - return -EACCES; - } - err = check_helper_mem_access(env, regno - 1, - reg->max_value, - zero_size_allowed, meta); + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); if (err) return err; - } else { - /* register is CONST_IMM */ - err = check_helper_mem_access(env, regno - 1, reg->imm, - zero_size_allowed, meta); } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); } return err; @@ -1352,6 +1458,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } +/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, + * so turn them into unknown SCALAR_VALUE. 
+ */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { struct bpf_verifier_state *state = &env->cur_state; @@ -1361,7 +1470,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET || regs[i].type == PTR_TO_PACKET_END) - mark_reg_unknown_value(regs, i); + mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) @@ -1370,8 +1479,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; - __mark_reg_unknown_value(state->spilled_regs, - i / BPF_REG_SIZE); + __mark_reg_unknown(reg); } } @@ -1451,14 +1559,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* update return register */ if (fn->ret_type == RET_INTEGER) { - regs[BPF_REG_0].type = UNKNOWN_VALUE; + /* sets type to SCALAR_VALUE */ + mark_reg_unknown(regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { struct bpf_insn_aux_data *insn_aux; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; + /* There is no offset yet applied, variable or fixed */ + mark_reg_known_zero(regs, BPF_REG_0); + regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() @@ -1489,456 +1600,337 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static int check_packet_ptr_add(struct bpf_verifier_env *env, - struct bpf_insn *insn) +static void check_reg_overflow(struct bpf_reg_state *reg) { - struct bpf_reg_state *regs = env->cur_state.regs; - struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - struct bpf_reg_state tmp_reg; - s32 imm; - - if (BPF_SRC(insn->code) == BPF_K) { - /* pkt_ptr += imm */ - imm = insn->imm; - -add_imm: - if (imm < 0) { - verbose("addition of negative constant to packet pointer is not allowed\n"); - return -EACCES; - } - if (imm >= MAX_PACKET_OFF || - imm + dst_reg->off >= MAX_PACKET_OFF) { - verbose("constant %d is too large to add to packet pointer\n", - imm); - return -EACCES; - } - /* a constant was added to pkt_ptr. - * Remember it while keeping the same 'id' - */ - dst_reg->off += imm; - } else { - bool had_id; - - if (src_reg->type == PTR_TO_PACKET) { - /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ - tmp_reg = *dst_reg; /* save r7 state */ - *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */ - src_reg = &tmp_reg; /* pretend it's src_reg state */ - /* if the checks below reject it, the copy won't matter, - * since we're rejecting the whole program. 
If all ok, - * then imm22 state will be added to r7 - * and r7 will be pkt(id=0,off=22,r=62) while - * r6 will stay as pkt(id=0,off=0,r=62) - */ - } - - if (src_reg->type == CONST_IMM) { - /* pkt_ptr += reg where reg is known constant */ - imm = src_reg->imm; - goto add_imm; - } - /* disallow pkt_ptr += reg - * if reg is not uknown_value with guaranteed zero upper bits - * otherwise pkt_ptr may overflow and addition will become - * subtraction which is not allowed - */ - if (src_reg->type != UNKNOWN_VALUE) { - verbose("cannot add '%s' to ptr_to_packet\n", - reg_type_str[src_reg->type]); - return -EACCES; - } - if (src_reg->imm < 48) { - verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n", - src_reg->imm); - return -EACCES; - } - - had_id = (dst_reg->id != 0); + if (reg->max_value > BPF_REGISTER_MAX_RANGE) + reg->max_value = BPF_REGISTER_MAX_RANGE; + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) + reg->min_value = BPF_REGISTER_MIN_RANGE; +} - /* dst_reg stays as pkt_ptr type and since some positive - * integer value was added to the pointer, increment its 'id' - */ - dst_reg->id = ++env->id_gen; - - /* something was added to pkt_ptr, set range to zero */ - dst_reg->aux_off += dst_reg->off; - dst_reg->off = 0; - dst_reg->range = 0; - if (had_id) - dst_reg->aux_off_align = min(dst_reg->aux_off_align, - src_reg->min_align); - else - dst_reg->aux_off_align = src_reg->min_align; +static void coerce_reg_to_32(struct bpf_reg_state *reg) +{ + /* 32-bit values can't be negative as an s64 */ + if (reg->min_value < 0) + reg->min_value = 0; + /* clear high 32 bits */ + reg->var_off = tnum_cast(reg->var_off, 4); + /* Did value become known? Then update bounds */ + if (tnum_is_const(reg->var_off)) { + if ((s64)reg->var_off.value > BPF_REGISTER_MIN_RANGE) + reg->min_value = reg->var_off.value; + if (reg->var_off.value < BPF_REGISTER_MAX_RANGE) + reg->max_value = reg->var_off.value; } - return 0; } -static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) +/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. + * Caller must check_reg_overflow all argument regs beforehand. + * Caller should also handle BPF_MOV case separately. + * If we return -EACCES, caller may want to try again treating pointer as a + * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks. + */ +static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; - struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + bool known = tnum_is_const(off_reg->var_off); + s64 min_val = off_reg->min_value; + u64 max_val = off_reg->max_value; u8 opcode = BPF_OP(insn->code); - s64 imm_log2; + u32 dst = insn->dst_reg; - /* for type == UNKNOWN_VALUE: - * imm > 0 -> number of zero upper bits - * imm == 0 -> don't track which is the same as all bits can be non-zero - */ + dst_reg = ®s[dst]; - if (BPF_SRC(insn->code) == BPF_X) { - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - - if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && - dst_reg->imm && opcode == BPF_ADD) { - /* dreg += sreg - * where both have zero upper bits. Adding them - * can only result making one more bit non-zero - * in the larger value. - * Ex. 
0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) - * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) - */ - dst_reg->imm = min(dst_reg->imm, src_reg->imm); - dst_reg->imm--; - return 0; - } - if (src_reg->type == CONST_IMM && src_reg->imm > 0 && - dst_reg->imm && opcode == BPF_ADD) { - /* dreg += sreg - * where dreg has zero upper bits and sreg is const. - * Adding them can only result making one more bit - * non-zero in the larger value. - */ - imm_log2 = __ilog2_u64((long long)src_reg->imm); - dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); - dst_reg->imm--; - return 0; - } - /* all other cases non supported yet, just mark dst_reg */ - dst_reg->imm = 0; - return 0; + if (WARN_ON_ONCE(known && (min_val != max_val))) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error\n"); + return -EINVAL; + } + + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops on pointers produce (meaningless) scalars */ + if (!env->allow_ptr_leaks) + verbose("R%d 32-bit pointer arithmetic prohibited\n", + dst); + return -EACCES; } - /* sign extend 32-bit imm into 64-bit to make sure that - * negative values occupy bit 63. Note ilog2() would have - * been incorrect, since sizeof(insn->imm) == 4 + if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + dst); + return -EACCES; + } + if (ptr_reg->type == CONST_PTR_TO_MAP) { + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + dst); + return -EACCES; + } + if (ptr_reg->type == PTR_TO_PACKET_END) { + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + dst); + return -EACCES; + } + + /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. + * The id may be overwritten later if we create a new variable offset. */ - imm_log2 = __ilog2_u64((long long)insn->imm); + dst_reg->type = ptr_reg->type; + dst_reg->id = ptr_reg->id; - if (dst_reg->imm && opcode == BPF_LSH) { - /* reg <<= imm - * if reg was a result of 2 byte load, then its imm == 48 - * which means that upper 48 bits are zero and shifting this reg - * left by 4 would mean that upper 44 bits are still zero + switch (opcode) { + case BPF_ADD: + /* We can take a fixed offset as long as it doesn't overflow + * the s32 'off' field */ - dst_reg->imm -= insn->imm; - } else if (dst_reg->imm && opcode == BPF_MUL) { - /* reg *= imm - * if multiplying by 14 subtract 4 - * This is conservative calculation of upper zero bits. - * It's not trying to special case insn->imm == 1 or 0 cases + if (known && (ptr_reg->off + min_val == + (s64)(s32)(ptr_reg->off + min_val))) { + /* pointer += K. Accumulate it into fixed offset */ + dst_reg->min_value = ptr_reg->min_value; + dst_reg->max_value = ptr_reg->max_value; + dst_reg->var_off = ptr_reg->var_off; + dst_reg->off = ptr_reg->off + min_val; + dst_reg->range = ptr_reg->range; + break; + } + if (max_val == BPF_REGISTER_MAX_RANGE) { + if (!env->allow_ptr_leaks) + verbose("R%d tried to add unbounded value to pointer\n", + dst); + return -EACCES; + } + /* A new variable offset is created. Note that off_reg->off + * == 0, since it's a scalar. + * dst_reg gets the pointer type and since some positive + * integer value was added to the pointer, give it a new 'id' + * if it's a PTR_TO_PACKET. + * this creates a new 'base' pointer, off_reg (variable) gets + * added into the variable offset, and we copy the fixed offset + * from ptr_reg. 
*/ - dst_reg->imm -= imm_log2 + 1; - } else if (opcode == BPF_AND) { - /* reg &= imm */ - dst_reg->imm = 63 - imm_log2; - } else if (dst_reg->imm && opcode == BPF_ADD) { - /* reg += imm */ - dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); - dst_reg->imm--; - } else if (opcode == BPF_RSH) { - /* reg >>= imm - * which means that after right shift, upper bits will be zero - * note that verifier already checked that - * 0 <= imm < 64 for shift insn + if (min_val <= BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value += min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value += max_val; + dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); + dst_reg->off = ptr_reg->off; + if (ptr_reg->type == PTR_TO_PACKET) { + dst_reg->id = ++env->id_gen; + /* something was added to pkt_ptr, set range to zero */ + dst_reg->range = 0; + } + break; + case BPF_SUB: + if (dst_reg == off_reg) { + /* scalar -= pointer. Creates an unknown scalar */ + if (!env->allow_ptr_leaks) + verbose("R%d tried to subtract pointer from scalar\n", + dst); + return -EACCES; + } + /* We don't allow subtraction from FP, because (according to + * test_verifier.c test "invalid fp arithmetic", JITs might not + * be able to deal with it. */ - dst_reg->imm += insn->imm; - if (unlikely(dst_reg->imm > 64)) - /* some dumb code did: - * r2 = *(u32 *)mem; - * r2 >>= 32; - * and all bits are zero now */ - dst_reg->imm = 64; - } else { - /* all other alu ops, means that we don't know what will - * happen to the value, mark it with unknown number of zero bits + if (ptr_reg->type == PTR_TO_STACK) { + if (!env->allow_ptr_leaks) + verbose("R%d subtraction from stack pointer prohibited\n", + dst); + return -EACCES; + } + if (known && (ptr_reg->off - min_val == + (s64)(s32)(ptr_reg->off - min_val))) { + /* pointer -= K. Subtract it from fixed offset */ + dst_reg->min_value = ptr_reg->min_value; + dst_reg->max_value = ptr_reg->max_value; + dst_reg->var_off = ptr_reg->var_off; + dst_reg->id = ptr_reg->id; + dst_reg->off = ptr_reg->off - min_val; + dst_reg->range = ptr_reg->range; + break; + } + /* Subtracting a negative value will just confuse everything. + * This can happen if off_reg is an immediate. */ - dst_reg->imm = 0; - } - - if (dst_reg->imm < 0) { - /* all 64 bits of the register can contain non-zero bits - * and such value cannot be added to ptr_to_packet, since it - * may overflow, mark it as unknown to avoid further eval + if ((s64)max_val < 0) { + if (!env->allow_ptr_leaks) + verbose("R%d tried to subtract negative max_val %lld from pointer\n", + dst, (s64)max_val); + return -EACCES; + } + /* A new variable offset is created. If the subtrahend is known + * nonnegative, then any reg->range we had before is still good. */ - dst_reg->imm = 0; - } - return 0; -} - -static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env, - struct bpf_insn *insn) -{ - struct bpf_reg_state *regs = env->cur_state.regs; - struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - u8 opcode = BPF_OP(insn->code); - s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm); - - /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */ - if (src_reg->imm > 0 && dst_reg->imm) { - switch (opcode) { - case BPF_ADD: - /* dreg += sreg - * where both have zero upper bits. Adding them - * can only result making one more bit non-zero - * in the larger value. - * Ex. 
0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) - * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) - */ - dst_reg->imm = min(src_reg->imm, 63 - imm_log2); - dst_reg->imm--; - break; - case BPF_AND: - /* dreg &= sreg - * AND can not extend zero bits only shrink - * Ex. 0x00..00ffffff - * & 0x0f..ffffffff - * ---------------- - * 0x00..00ffffff - */ - dst_reg->imm = max(src_reg->imm, 63 - imm_log2); - break; - case BPF_OR: - /* dreg |= sreg - * OR can only extend zero bits - * Ex. 0x00..00ffffff - * | 0x0f..ffffffff - * ---------------- - * 0x0f..00ffffff - */ - dst_reg->imm = min(src_reg->imm, 63 - imm_log2); - break; - case BPF_SUB: - case BPF_MUL: - case BPF_RSH: - case BPF_LSH: - /* These may be flushed out later */ - default: - mark_reg_unknown_value(regs, insn->dst_reg); + if (max_val >= BPF_REGISTER_MAX_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= max_val; + if (min_val <= BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= min_val; + dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); + dst_reg->off = ptr_reg->off; + if (ptr_reg->type == PTR_TO_PACKET) { + dst_reg->id = ++env->id_gen; + /* something was added to pkt_ptr, set range to zero */ + if (min_val < 0) + dst_reg->range = 0; } - } else { - mark_reg_unknown_value(regs, insn->dst_reg); + break; + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* bitwise ops on pointers are troublesome, prohibit for now. + * (However, in principle we could allow some cases, e.g. + * ptr &= ~3 which would reduce min_value by 3.) + */ + if (!env->allow_ptr_leaks) + verbose("R%d bitwise operator %s on pointer prohibited\n", + dst, bpf_alu_string[opcode >> 4]); + return -EACCES; + default: + /* other operators (e.g. MUL,LSH) produce non-pointer results */ + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic with %s operator prohibited\n", + dst, bpf_alu_string[opcode >> 4]); + return -EACCES; } - dst_reg->type = UNKNOWN_VALUE; + check_reg_overflow(dst_reg); return 0; } -static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, - struct bpf_insn *insn) +static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_reg_state *dst_reg, + struct bpf_reg_state src_reg) { struct bpf_reg_state *regs = env->cur_state.regs; - struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - u8 opcode = BPF_OP(insn->code); - u64 dst_imm = dst_reg->imm; - - if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) - return evaluate_reg_imm_alu_unknown(env, insn); - - /* dst_reg->type == CONST_IMM here. Simulate execution of insns - * containing ALU ops. Don't care about overflow or negative - * values, just add/sub/... them; registers are in u64. 
- */ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { - dst_imm += insn->imm; - } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm += src_reg->imm; - } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { - dst_imm -= insn->imm; - } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm -= src_reg->imm; - } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { - dst_imm *= insn->imm; - } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm *= src_reg->imm; - } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { - dst_imm |= insn->imm; - } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm |= src_reg->imm; - } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { - dst_imm &= insn->imm; - } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm &= src_reg->imm; - } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { - dst_imm >>= insn->imm; - } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm >>= src_reg->imm; - } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { - dst_imm <<= insn->imm; - } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm <<= src_reg->imm; - } else { - mark_reg_unknown_value(regs, insn->dst_reg); - goto out; - } - - dst_reg->imm = dst_imm; -out: - return 0; -} - -static void check_reg_overflow(struct bpf_reg_state *reg) -{ - if (reg->max_value > BPF_REGISTER_MAX_RANGE) - reg->max_value = BPF_REGISTER_MAX_RANGE; - if (reg->min_value < BPF_REGISTER_MIN_RANGE || - reg->min_value > BPF_REGISTER_MAX_RANGE) - reg->min_value = BPF_REGISTER_MIN_RANGE; -} - -static u32 calc_align(u32 imm) -{ - if (!imm) - return 1U << 31; - return imm - ((imm - 1) & imm); -} - -static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, - struct bpf_insn *insn) -{ - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; s64 min_val = BPF_REGISTER_MIN_RANGE; u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); - u32 dst_align, src_align; + bool src_known, dst_known; - dst_reg = ®s[insn->dst_reg]; - src_align = 0; - if (BPF_SRC(insn->code) == BPF_X) { - check_reg_overflow(®s[insn->src_reg]); - min_val = regs[insn->src_reg].min_value; - max_val = regs[insn->src_reg].max_value; - - /* If the source register is a random pointer then the - * min_value/max_value values represent the range of the known - * accesses into that value, not the actual min/max value of the - * register itself. In this case we have to reset the reg range - * values so we know it is not safe to look at. - */ - if (regs[insn->src_reg].type != CONST_IMM && - regs[insn->src_reg].type != UNKNOWN_VALUE) { - min_val = BPF_REGISTER_MIN_RANGE; - max_val = BPF_REGISTER_MAX_RANGE; - src_align = 0; - } else { - src_align = regs[insn->src_reg].min_align; - } - } else if (insn->imm < BPF_REGISTER_MAX_RANGE && - (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { - min_val = max_val = insn->imm; - src_align = calc_align(insn->imm); - } - - dst_align = dst_reg->min_align; - - /* We don't know anything about what was done to this register, mark it - * as unknown. 
Also, if both derived bounds came from signed/unsigned - * mixed compares and one side is unbounded, we cannot really do anything - * with them as boundaries cannot be trusted. Thus, arithmetic of two - * regs of such kind will get invalidated bounds on the dst side. - */ - if ((min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) || - (BPF_SRC(insn->code) == BPF_X && - ((min_val != BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) || - (min_val == BPF_REGISTER_MIN_RANGE && - max_val != BPF_REGISTER_MAX_RANGE) || - (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && - dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || - (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && - dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && - regs[insn->dst_reg].value_from_signed != - regs[insn->src_reg].value_from_signed)) { - reset_reg_range_values(regs, insn->dst_reg); - return; - } - - /* If one of our values was at the end of our ranges then we can't just - * do our normal operations to the register, we need to set the values - * to the min/max since they are undefined. - */ - if (opcode != BPF_SUB) { - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops are (32,32)->64 */ + coerce_reg_to_32(dst_reg); + coerce_reg_to_32(&src_reg); } + min_val = src_reg.min_value; + max_val = src_reg.max_value; + src_known = tnum_is_const(src_reg.var_off); + dst_known = tnum_is_const(dst_reg->var_off); switch (opcode) { case BPF_ADD: + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value += min_val; + /* if max_val is MAX_RANGE, this will saturate dst->max */ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value += max_val; - dst_reg->min_align = min(src_align, dst_align); + dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: - /* If one of our values was at the end of our ranges, then the - * _opposite_ value in the dst_reg goes to the end of our range. - */ - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; if (max_val == BPF_REGISTER_MAX_RANGE) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value -= max_val; + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value -= min_val; - dst_reg->min_align = min(src_align, dst_align); + dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_MUL: - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value *= min_val; + if (min_val < 0 || dst_reg->min_value < 0) { + /* Ain't nobody got time to multiply that sign */ + __mark_reg_unknown(dst_reg); + break; + } + dst_reg->min_value *= min_val; + /* if max_val is MAX_RANGE, this will saturate dst->max. + * We know MAX_RANGE ** 2 won't overflow a u64, because + * MAX_RANGE itself fits in a u32. + */ + BUILD_BUG_ON(BPF_REGISTER_MAX_RANGE > (u32)-1); if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value *= max_val; - dst_reg->min_align = max(src_align, dst_align); + dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); break; case BPF_AND: - /* Disallow AND'ing of negative numbers, ain't nobody got time - * for that. 
Otherwise the minimum is 0 and the max is the max - * value we could AND against. + if (src_known && dst_known) { + u64 value = dst_reg->var_off.value & src_reg.var_off.value; + + dst_reg->var_off = tnum_const(value); + dst_reg->min_value = dst_reg->max_value = min_t(u64, + value, BPF_REGISTER_MAX_RANGE); + break; + } + /* Lose min_value when AND'ing negative numbers, ain't nobody + * got time for that. Otherwise we get our minimum from the + * var_off, since that's inherently bitwise. + * Our maximum is the minimum of the operands' maxima. */ - if (min_val < 0) + dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); + if (min_val < 0 && dst_reg->min_value < 0) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; else - dst_reg->min_value = 0; - dst_reg->max_value = max_val; - dst_reg->min_align = max(src_align, dst_align); + dst_reg->min_value = dst_reg->var_off.value; + dst_reg->max_value = min(dst_reg->max_value, max_val); + break; + case BPF_OR: + if (src_known && dst_known) { + u64 value = dst_reg->var_off.value | src_reg.var_off.value; + + dst_reg->var_off = tnum_const(value); + dst_reg->min_value = dst_reg->max_value = min_t(u64, + value, BPF_REGISTER_MAX_RANGE); + break; + } + /* Lose ranges when OR'ing negative numbers, ain't nobody got + * time for that. Otherwise we get our maximum from the var_off, + * and our minimum is the maximum of the operands' minima. + */ + dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); + if (min_val < 0 || dst_reg->min_value < 0) { + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + } else { + dst_reg->min_value = max(dst_reg->min_value, min_val); + dst_reg->max_value = dst_reg->var_off.value | dst_reg->var_off.mask; + } break; case BPF_LSH: + if (min_val < 0) { + /* LSH by a negative number is undefined */ + mark_reg_unknown(regs, insn->dst_reg); + break; + } /* Gotta have special overflow logic here, if we're shifting * more than MAX_RANGE then just assume we have an invalid * range. */ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - dst_reg->min_align = 1; + dst_reg->var_off = tnum_unknown; } else { if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value <<= min_val; - if (!dst_reg->min_align) - dst_reg->min_align = 1; - dst_reg->min_align <<= min_val; + if (src_known) + dst_reg->var_off = tnum_lshift(dst_reg->var_off, min_val); + else + dst_reg->var_off = tnum_lshift(tnum_unknown, min_val); } if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; @@ -1946,37 +1938,139 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->max_value <<= max_val; break; case BPF_RSH: - /* RSH by a negative number is undefined, and the BPF_RSH is an - * unsigned shift, so make the appropriate casts. 
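As context for the var_off handling in the AND/OR cases above: var_off is a "tnum" that records, per bit, whether that bit's value is known. A minimal standalone sketch of the idea, using a simplified struct and illustrative helper names rather than the kernel's tnum.c:

#include <stdint.h>
#include <stdio.h>

/* mask bit set = bit unknown; value holds the bits known to be 1 */
struct knownbits { uint64_t value; uint64_t mask; };

static struct knownbits kb_and(struct knownbits a, struct knownbits b)
{
	uint64_t maybe_a = a.value | a.mask;	/* bits that could be 1 in a */
	uint64_t maybe_b = b.value | b.mask;	/* bits that could be 1 in b */
	uint64_t v = a.value & b.value;		/* known 1 in both -> known 1 */

	/* unknown iff possibly 1 in both operands but not known 1 in both */
	return (struct knownbits){ v, (maybe_a & maybe_b) & ~v };
}

static struct knownbits kb_or(struct knownbits a, struct knownbits b)
{
	uint64_t v = a.value | b.value;		/* known 1 in either -> known 1 */
	uint64_t mu = a.mask | b.mask;		/* unknown in either operand... */

	return (struct knownbits){ v, mu & ~v };	/* ...unless already known 1 */
}

int main(void)
{
	/* r1: an unknown u8 (low 8 bits unknown), r2: constant 0x0f */
	struct knownbits r1 = { 0x00, 0xff }, r2 = { 0x0f, 0x00 };
	struct knownbits res_and = kb_and(r1, r2), res_or = kb_or(r1, r2);

	/* AND result: value=0, mask=0x0f (only the low 4 bits stay unknown) */
	printf("and: value=%#llx mask=%#llx\n",
	       (unsigned long long)res_and.value,
	       (unsigned long long)res_and.mask);
	/* OR result: value=0x0f, mask=0xf0 (low 4 bits known 1, bits 4-7 unknown) */
	printf("or:  value=%#llx mask=%#llx\n",
	       (unsigned long long)res_or.value,
	       (unsigned long long)res_or.mask);
	return 0;
}

This is the property the bound updates above lean on: AND can only clear bits, so the maximum shrinks, while OR can only set bits, so the minimum grows.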
- */ - if (min_val < 0 || dst_reg->min_value < 0) { - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (min_val < 0) { + /* RSH by a negative number is undefined */ + mark_reg_unknown(regs, insn->dst_reg); + break; + } + /* BPF_RSH is an unsigned shift, so make the appropriate casts */ + if (dst_reg->min_value < 0) { + if (min_val) + /* Sign bit will be cleared */ + dst_reg->min_value = 0; } else { dst_reg->min_value = (u64)(dst_reg->min_value) >> min_val; } - if (min_val < 0) { - dst_reg->min_align = 1; - } else { - dst_reg->min_align >>= (u64) min_val; - if (!dst_reg->min_align) - dst_reg->min_align = 1; - } - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value >>= max_val; + if (src_known) + dst_reg->var_off = tnum_rshift(dst_reg->var_off, min_val); + else + dst_reg->var_off = tnum_rshift(tnum_unknown, min_val); + if (dst_reg->max_value == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = ~0; + dst_reg->max_value >>= max_val; break; default: - reset_reg_range_values(regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); break; } check_reg_overflow(dst_reg); + return 0; +} + +/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max + * and var_off. + */ +static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; + u8 opcode = BPF_OP(insn->code); + int rc; + + dst_reg = ®s[insn->dst_reg]; + check_reg_overflow(dst_reg); + src_reg = NULL; + if (dst_reg->type != SCALAR_VALUE) + ptr_reg = dst_reg; + if (BPF_SRC(insn->code) == BPF_X) { + src_reg = ®s[insn->src_reg]; + check_reg_overflow(src_reg); + + if (src_reg->type != SCALAR_VALUE) { + if (dst_reg->type != SCALAR_VALUE) { + /* Combining two pointers by any ALU op yields + * an arbitrary scalar. + */ + if (!env->allow_ptr_leaks) { + verbose("R%d pointer %s pointer prohibited\n", + insn->dst_reg, + bpf_alu_string[opcode >> 4]); + return -EACCES; + } + mark_reg_unknown(regs, insn->dst_reg); + return 0; + } else { + /* scalar += pointer + * This is legal, but we have to reverse our + * src/dest handling in computing the range + */ + rc = adjust_ptr_min_max_vals(env, insn, + src_reg, dst_reg); + if (rc == -EACCES && env->allow_ptr_leaks) { + /* scalar += unknown scalar */ + __mark_reg_unknown(&off_reg); + return adjust_scalar_min_max_vals( + env, insn, + dst_reg, off_reg); + } + return rc; + } + } else if (ptr_reg) { + /* pointer += scalar */ + rc = adjust_ptr_min_max_vals(env, insn, + dst_reg, src_reg); + if (rc == -EACCES && env->allow_ptr_leaks) { + /* unknown scalar += scalar */ + __mark_reg_unknown(dst_reg); + return adjust_scalar_min_max_vals( + env, insn, dst_reg, *src_reg); + } + return rc; + } + } else { + /* Pretend the src is a reg with a known value, since we only + * need to be able to read from this state. 
+ */ + off_reg.type = SCALAR_VALUE; + off_reg.var_off = tnum_const(insn->imm); + off_reg.min_value = insn->imm; + off_reg.max_value = insn->imm; + src_reg = &off_reg; + check_reg_overflow(src_reg); + if (ptr_reg) { /* pointer += K */ + rc = adjust_ptr_min_max_vals(env, insn, + ptr_reg, src_reg); + if (rc == -EACCES && env->allow_ptr_leaks) { + /* unknown scalar += K */ + __mark_reg_unknown(dst_reg); + return adjust_scalar_min_max_vals( + env, insn, dst_reg, off_reg); + } + return rc; + } + } + + /* Got here implies adding two SCALAR_VALUEs */ + if (WARN_ON_ONCE(ptr_reg)) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error: unexpected ptr_reg\n"); + return -EINVAL; + } + if (WARN_ON(!src_reg)) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error: no src_reg\n"); + return -EINVAL; + } + return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); } /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = env->cur_state.regs; u8 opcode = BPF_OP(insn->code); int err; @@ -2036,11 +2130,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; - /* we are setting our register to something new, we need to - * reset its range values. - */ - reset_reg_range_values(regs, insn->dst_reg); - if (BPF_SRC(insn->code) == BPF_X) { if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 @@ -2048,24 +2137,29 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg] = regs[insn->src_reg]; } else { + /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { verbose("R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown_value(regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); + /* high 32 bits are known zero. But this is + * still out of range for max_value, so leave + * that. + */ + regs[insn->dst_reg].var_off = tnum_cast( + regs[insn->dst_reg].var_off, 4); } } else { /* case: R = imm * remember the value we stored into this reg */ - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = insn->imm; - regs[insn->dst_reg].id = 0; + regs[insn->dst_reg].type = SCALAR_VALUE; + regs[insn->dst_reg].var_off = tnum_const(insn->imm); regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; - regs[insn->dst_reg].min_align = calc_align(insn->imm); - regs[insn->dst_reg].value_from_signed = false; + regs[insn->dst_reg].id = 0; } } else if (opcode > BPF_END) { @@ -2116,68 +2210,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; - dst_reg = ®s[insn->dst_reg]; - - /* first we want to adjust our ranges. 
*/ - adjust_reg_min_max_vals(env, insn); - - /* pattern match 'bpf_add Rx, imm' instruction */ - if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { - dst_reg->type = PTR_TO_STACK; - dst_reg->imm = insn->imm; - return 0; - } else if (opcode == BPF_ADD && - BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == PTR_TO_STACK && - ((BPF_SRC(insn->code) == BPF_X && - regs[insn->src_reg].type == CONST_IMM) || - BPF_SRC(insn->code) == BPF_K)) { - if (BPF_SRC(insn->code) == BPF_X) - dst_reg->imm += regs[insn->src_reg].imm; - else - dst_reg->imm += insn->imm; - return 0; - } else if (opcode == BPF_ADD && - BPF_CLASS(insn->code) == BPF_ALU64 && - (dst_reg->type == PTR_TO_PACKET || - (BPF_SRC(insn->code) == BPF_X && - regs[insn->src_reg].type == PTR_TO_PACKET))) { - /* ptr_to_packet += K|X */ - return check_packet_ptr_add(env, insn); - } else if (BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == UNKNOWN_VALUE && - env->allow_ptr_leaks) { - /* unknown += K|X */ - return evaluate_reg_alu(env, insn); - } else if (BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == CONST_IMM && - env->allow_ptr_leaks) { - /* reg_imm += K|X */ - return evaluate_reg_imm_alu(env, insn); - } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", - insn->dst_reg); - return -EACCES; - } else if (BPF_SRC(insn->code) == BPF_X && - is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer arithmetic prohibited\n", - insn->src_reg); - return -EACCES; - } - - /* If we did pointer math on a map value then just set it to our - * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or - * loads to this register appropriately, otherwise just mark the - * register as unknown. - */ - if (env->allow_ptr_leaks && - BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD && - (dst_reg->type == PTR_TO_MAP_VALUE || - dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) - dst_reg->type = PTR_TO_MAP_VALUE_ADJ; - else - mark_reg_unknown_value(regs, insn->dst_reg); + return adjust_reg_min_max_vals(env, insn); } return 0; @@ -2189,6 +2222,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, struct bpf_reg_state *regs = state->regs, *reg; int i; + if (dst_reg->off < 0) + /* This doesn't give us any range */ + return; + + if (dst_reg->max_value > MAX_PACKET_OFF || + dst_reg->max_value + dst_reg->off > MAX_PACKET_OFF) + /* Risk of overflow. For instance, ptr + (1<<63) may be less + * than pkt_end, but that's because it's also less than pkt. + */ + return; + /* LLVM can generate two kind of checks: * * Type 1: @@ -2219,30 +2263,44 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * so that range of bytes [r3, r3 + 8) is safe to access. */ + /* If our ids match, then we must have the same max_value. And we + * don't care about the other reg's fixed offset, since if it's too big + * the range won't allow anything. + * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
+ */ for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ - regs[i].range = max(regs[i].range, dst_reg->off); + regs[i].range = max_t(u16, regs[i].range, dst_reg->off); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) - reg->range = max(reg->range, dst_reg->off); + reg->range = max_t(u16, reg->range, dst_reg->off); } } /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. + * In JEQ/JNE cases we also adjust the var_off values. */ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { bool value_from_signed = true; - bool is_range = true; + + /* If the dst_reg is a pointer, we can't learn anything about its + * variable offset from the compare (unless src_reg were a pointer into + * the same object, but we don't bother with that. + * Since false_reg and true_reg have the same type by construction, we + * only need to check one of them for pointerness. + */ + if (__is_pointer_value(false, false_reg)) + return; switch (opcode) { case BPF_JEQ: @@ -2250,14 +2308,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; - is_range = false; + true_reg->var_off = tnum_const(val); break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; - is_range = false; + false_reg->var_off = tnum_const(val); break; case BPF_JGT: value_from_signed = false; @@ -2305,23 +2363,19 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); - if (is_range) { - if (__is_pointer_value(false, false_reg)) - reset_reg_range_values(false_reg, 0); - if (__is_pointer_value(false, true_reg)) - reset_reg_range_values(true_reg, 0); - } } -/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg - * is the variable reg. +/* Same as above, but for the case that dst_reg holds a constant and src_reg is + * the variable reg. */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { bool value_from_signed = true; - bool is_range = true; + + if (__is_pointer_value(false, false_reg)) + return; switch (opcode) { case BPF_JEQ: @@ -2329,14 +2383,14 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, * true then we know for sure. 
*/ true_reg->max_value = true_reg->min_value = val; - is_range = false; + true_reg->var_off = tnum_const(val); break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; - is_range = false; + false_reg->var_off = tnum_const(val); break; case BPF_JGT: value_from_signed = false; @@ -2385,27 +2439,60 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); - if (is_range) { - if (__is_pointer_value(false, false_reg)) - reset_reg_range_values(false_reg, 0); - if (__is_pointer_value(false, true_reg)) - reset_reg_range_values(true_reg, 0); +} + +/* Regs are known to be equal, so intersect their min/max/var_off */ +static void __reg_combine_min_max(struct bpf_reg_state *src_reg, + struct bpf_reg_state *dst_reg) +{ + src_reg->min_value = dst_reg->min_value = max(src_reg->min_value, + dst_reg->min_value); + src_reg->max_value = dst_reg->max_value = min(src_reg->max_value, + dst_reg->max_value); + src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, + dst_reg->var_off); + check_reg_overflow(src_reg); + check_reg_overflow(dst_reg); +} + +static void reg_combine_min_max(struct bpf_reg_state *true_src, + struct bpf_reg_state *true_dst, + struct bpf_reg_state *false_src, + struct bpf_reg_state *false_dst, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + __reg_combine_min_max(true_src, true_dst); + break; + case BPF_JNE: + __reg_combine_min_max(false_src, false_dst); } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, - enum bpf_reg_type type) + bool is_null) { struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - if (type == UNKNOWN_VALUE) { - __mark_reg_unknown_value(regs, regno); + /* Old offset (both fixed and variable parts) should + * have been known-zero, because we don't allow pointer + * arithmetic on pointers that might be NULL. + */ + if (WARN_ON_ONCE(reg->min_value || reg->max_value || + reg->var_off.value || reg->var_off.mask || + reg->off)) { + reg->min_value = reg->max_value = reg->off = 0; + reg->var_off = tnum_const(0); + } + if (is_null) { + reg->type = SCALAR_VALUE; } else if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; } else { - reg->type = type; + reg->type = PTR_TO_MAP_VALUE; } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances @@ -2419,19 +2506,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, * be folded together at some point. 
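The JEQ case of reg_combine_min_max() above is plain interval intersection: once two registers are proven equal, each value must satisfy both registers' bounds. A small sketch with made-up numbers and simplified fields (the real helper also intersects var_off via tnum_intersect()):

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the min/max part of bpf_reg_state */
struct bounds { int64_t min; int64_t max; };

/* If two values are known equal, both can take the tighter of each bound */
static void combine_equal(struct bounds *a, struct bounds *b)
{
	int64_t lo = a->min > b->min ? a->min : b->min;
	int64_t hi = a->max < b->max ? a->max : b->max;

	a->min = b->min = lo;
	a->max = b->max = hi;
}

int main(void)
{
	struct bounds src = { 0, 100 };		/* e.g. a checked array index */
	struct bounds dst = { 4, 1000 };	/* e.g. a length known to be >= 4 */

	combine_equal(&src, &dst);
	/* both registers end up as [4, 100] */
	printf("src=[%lld,%lld] dst=[%lld,%lld]\n",
	       (long long)src.min, (long long)src.max,
	       (long long)dst.min, (long long)dst.max);
	return 0;
}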
*/ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, - enum bpf_reg_type type) + bool is_null) { struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; int i; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, id, type); + mark_map_reg(regs, i, id, is_null); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type); + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); } } @@ -2481,7 +2568,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if R == 0 where R was initialized to zero earlier */ if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) { + dst_reg->type == SCALAR_VALUE && + tnum_equals_const(dst_reg->var_off, insn->imm)) { if (opcode == BPF_JEQ) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through @@ -2503,17 +2591,30 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. + * this is only legit if both are scalars (or pointers to the same + * object, I suppose, but we don't support that right now), because + * otherwise the different base pointers mean the offsets aren't + * comparable. */ if (BPF_SRC(insn->code) == BPF_X) { - if (regs[insn->src_reg].type == CONST_IMM) - reg_set_min_max(&other_branch->regs[insn->dst_reg], - dst_reg, regs[insn->src_reg].imm, - opcode); - else if (dst_reg->type == CONST_IMM) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], - ®s[insn->src_reg], dst_reg->imm, - opcode); - } else { + if (dst_reg->type == SCALAR_VALUE && + regs[insn->src_reg].type == SCALAR_VALUE) { + if (tnum_is_const(regs[insn->src_reg].var_off)) + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, regs[insn->src_reg].var_off.value, + opcode); + else if (tnum_is_const(dst_reg->var_off)) + reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + ®s[insn->src_reg], + dst_reg->var_off.value, opcode); + else if (opcode == BPF_JEQ || opcode == BPF_JNE) + /* Comparing for equality, we can combine knowledge */ + reg_combine_min_max(&other_branch->regs[insn->src_reg], + &other_branch->regs[insn->dst_reg], + ®s[insn->src_reg], + ®s[insn->dst_reg], opcode); + } + } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch->regs[insn->dst_reg], dst_reg, insn->imm, opcode); } @@ -2525,10 +2626,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* Mark all identical map registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ - mark_map_regs(this_branch, insn->dst_reg, - opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE); - mark_map_regs(other_branch, insn->dst_reg, - opcode == BPF_JEQ ? 
UNKNOWN_VALUE : PTR_TO_MAP_VALUE); + mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); + mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { @@ -2576,8 +2675,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = imm; + regs[insn->dst_reg].type = SCALAR_VALUE; + regs[insn->dst_reg].min_value = imm; + regs[insn->dst_reg].max_value = imm; + check_reg_overflow(®s[insn->dst_reg]); + regs[insn->dst_reg].var_off = tnum_const(imm); regs[insn->dst_reg].id = 0; return 0; } @@ -2659,7 +2761,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* mark destination R0 register as readable, since it contains * the value fetched from the packet */ - regs[BPF_REG_0].type = UNKNOWN_VALUE; + mark_reg_unknown(regs, BPF_REG_0); return 0; } @@ -2862,57 +2964,145 @@ err_free: return ret; } -/* the following conditions reduce the number of explored insns - * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet +/* check %cur's range satisfies %old's */ +static bool range_within(struct bpf_reg_state *old, + struct bpf_reg_state *cur) +{ + return old->min_value <= cur->min_value && + old->max_value >= cur->max_value; +} + +/* Maximum number of register states that can exist at once */ +#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) +struct idpair { + u32 old; + u32 cur; +}; + +/* If in the old state two registers had the same id, then they need to have + * the same id in the new state as well. But that id could be different from + * the old state, so we need to track the mapping from old to new ids. + * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent + * regs with old id 5 must also have new id 9 for the new state to be safe. But + * regs with a different old id could still have new id 9, we don't care about + * that. + * So we look through our idmap to see if this old id has been seen before. If + * so, we require the new id to match; otherwise, we add the id pair to the map. */ -static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, - struct bpf_reg_state *old, - struct bpf_reg_state *cur) +static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) { - if (old->id != cur->id) - return false; + unsigned int i; + + for (i = 0; i < ID_MAP_SIZE; i++) { + if (!idmap[i].old) { + /* Reached an empty slot; haven't seen this id before */ + idmap[i].old = old_id; + idmap[i].cur = cur_id; + return true; + } + if (idmap[i].old == old_id) + return idmap[i].cur == cur_id; + } + /* We ran out of idmap slots, which should be impossible */ + WARN_ON_ONCE(1); + return false; +} - /* old ptr_to_packet is more conservative, since it allows smaller - * range. Ex: - * old(off=0,r=10) is equal to cur(off=0,r=20), because - * old(off=0,r=10) means that with range=10 the verifier proceeded - * further and found no issues with the program. Now we're in the same - * spot with cur(off=0,r=20), so we're safe too, since anything further - * will only be looking at most 10 bytes after this pointer. 
- */ - if (old->off == cur->off && old->range < cur->range) +/* Returns true if (rold safe implies rcur safe) */ +static bool regsafe(struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, + bool varlen_map_access, struct idpair *idmap) +{ + if (memcmp(rold, rcur, sizeof(*rold)) == 0) return true; - /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0) - * since both cannot be used for packet access and safe(old) - * pointer has smaller off that could be used for further - * 'if (ptr > data_end)' check - * Ex: - * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean - * that we cannot access the packet. - * The safe range is: - * [ptr, ptr + range - off) - * so whenever off >=range, it means no safe bytes from this pointer. - * When comparing old->off <= cur->off, it means that older code - * went with smaller offset and that offset was later - * used to figure out the safe range after 'if (ptr > data_end)' check - * Say, 'old' state was explored like: - * ... R3(off=0, r=0) - * R4 = R3 + 20 - * ... now R4(off=20,r=0) <-- here - * if (R4 > data_end) - * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access. - * ... the code further went all the way to bpf_exit. - * Now the 'cur' state at the mark 'here' has R4(off=30,r=0). - * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier - * goes further, such cur_R4 will give larger safe packet range after - * 'if (R4 > data_end)' and all further insn were already good with r=20, - * so they will be good with r=30 and we can prune the search. - */ - if (!env->strict_alignment && old->off <= cur->off && - old->off >= old->range && cur->off >= cur->range) + if (rold->type == NOT_INIT) + /* explored state can't have used this */ return true; + if (rcur->type == NOT_INIT) + return false; + switch (rold->type) { + case SCALAR_VALUE: + if (rcur->type == SCALAR_VALUE) { + /* new val must satisfy old val knowledge */ + return range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + } else { + /* if we knew anything about the old value, we're not + * equal, because we can't know anything about the + * scalar value of the pointer in the new value. + */ + return rold->min_value == BPF_REGISTER_MIN_RANGE && + rold->max_value == BPF_REGISTER_MAX_RANGE && + tnum_is_unknown(rold->var_off); + } + case PTR_TO_MAP_VALUE: + if (varlen_map_access) { + /* If the new min/max/var_off satisfy the old ones and + * everything else matches, we are OK. + * We don't care about the 'id' value, because nothing + * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + } else { + /* If the ranges/var_off were not the same, but + * everything else was and we didn't do a variable + * access into a map then we are a-ok. + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0; + } + case PTR_TO_MAP_VALUE_OR_NULL: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. 
+ */ + if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET: + if (rcur->type != PTR_TO_PACKET) + return false; + /* We must have at least as much range as the old ptr + * did, so that any accesses which were safe before are + * still safe. This is true even if old range < old off, + * since someone could have accessed through (ptr - k), or + * even done ptr -= k in a register, to get a safe access. + */ + if (rold->range > rcur->range) + return false; + /* If the offsets don't match, we can't trust our alignment; + * nor can we be sure that we won't fall out of range. + */ + if (rold->off != rcur->off) + return false; + /* id relations must be preserved */ + if (rold->id && !check_ids(rold->id, rcur->id, idmap)) + return false; + /* new val must satisfy old val knowledge */ + return range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + case PTR_TO_CTX: + case CONST_PTR_TO_MAP: + case PTR_TO_STACK: + case PTR_TO_PACKET_END: + /* Only valid matches are exact, which memcmp() above + * would have accepted + */ + default: + /* Don't know what's going on, just say it's not safe */ + return false; + } + /* Shouldn't get here; if we do, say it's not safe */ + WARN_ON_ONCE(1); return false; } @@ -2947,43 +3137,19 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { bool varlen_map_access = env->varlen_map_value_access; - struct bpf_reg_state *rold, *rcur; + struct idpair *idmap; + bool ret = false; int i; - for (i = 0; i < MAX_BPF_REG; i++) { - rold = &old->regs[i]; - rcur = &cur->regs[i]; - - if (memcmp(rold, rcur, sizeof(*rold)) == 0) - continue; - - /* If the ranges were not the same, but everything else was and - * we didn't do a variable access into a map then we are a-ok. - */ - if (!varlen_map_access && - memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) - continue; - - /* If we didn't map access then again we don't care about the - * mismatched range values and it's ok if our old type was - * UNKNOWN and we didn't go to a NOT_INIT'ed reg. - */ - if (rold->type == NOT_INIT || - (!varlen_map_access && rold->type == UNKNOWN_VALUE && - rcur->type != NOT_INIT)) - continue; - - /* Don't care about the reg->id in this case. 
*/ - if (rold->type == PTR_TO_MAP_VALUE_OR_NULL && - rcur->type == PTR_TO_MAP_VALUE_OR_NULL && - rold->map_ptr == rcur->map_ptr) - continue; - - if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(env, rold, rcur)) - continue; - + idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); + /* If we failed to allocate the idmap, just say it's not safe */ + if (!idmap) return false; + + for (i = 0; i < MAX_BPF_REG; i++) { + if (!regsafe(&old->regs[i], &cur->regs[i], varlen_map_access, + idmap)) + goto out_free; } for (i = 0; i < MAX_BPF_STACK; i++) { @@ -2995,29 +3161,32 @@ static bool states_equal(struct bpf_verifier_env *env, * this verifier states are not equivalent, * return false to continue verification of this path */ - return false; + goto out_free; if (i % BPF_REG_SIZE) continue; if (old->stack_slot_type[i] != STACK_SPILL) continue; - if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - sizeof(old->spilled_regs[0]))) - /* when explored and current stack slot types are - * the same, check that stored pointers types + if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], + &cur->spilled_regs[i / BPF_REG_SIZE], + varlen_map_access, idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types * are the same as well. * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ - return false; + goto out_free; else continue; } - return true; + ret = true; +out_free: + kfree(idmap); + return ret; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -3331,7 +3500,6 @@ process_bpf_exit: verbose("invalid BPF_LD mode\n"); return -EINVAL; } - reset_reg_range_values(regs, insn->dst_reg); } else { verbose("unknown insn class %d\n", class); return -EINVAL; -- cgit v1.2.3 From b03c9f9fdc37dab81ea04d5dacdc5995d4c224c2 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 7 Aug 2017 15:26:36 +0100 Subject: bpf/verifier: track signed and unsigned min/max values Allows us to, sometimes, combine information from a signed check of one bound and an unsigned check of the other. We now track the full range of possible values, rather than restricting ourselves to [0, 1<<30) and considering anything beyond that as unknown. While this is probably not necessary, it makes the code more straightforward and symmetrical between signed and unsigned bounds. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- kernel/bpf/tnum.c | 16 ++ kernel/bpf/verifier.c | 737 +++++++++++++++++++++++++++++--------------------- 2 files changed, 445 insertions(+), 308 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 92eeeb1974a2..1f4bf68c12db 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -17,6 +17,22 @@ struct tnum tnum_const(u64 value) return TNUM(value, 0); } +struct tnum tnum_range(u64 min, u64 max) +{ + u64 chi = min ^ max, delta; + u8 bits = fls64(chi); + + /* special case, needed because 1ULL << 64 is undefined */ + if (bits > 63) + return tnum_unknown; + /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. 
+ * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return + * constant min (since min == max). + */ + delta = (1ULL << bits) - 1; + return TNUM(min & ~delta, delta); +} + struct tnum tnum_lshift(struct tnum a, u8 shift) { return TNUM(a.value << shift, a.mask << shift); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c3f88b466c30..7557800bf7a7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,12 +234,20 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose(",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); - if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%lld", - (long long)reg->min_value); - if (reg->max_value != BPF_REGISTER_MAX_RANGE) - verbose(",max_value=%llu", - (unsigned long long)reg->max_value); + if (reg->smin_value != reg->umin_value && + reg->smin_value != S64_MIN) + verbose(",smin_value=%lld", + (long long)reg->smin_value); + if (reg->smax_value != reg->umax_value && + reg->smax_value != S64_MAX) + verbose(",smax_value=%lld", + (long long)reg->smax_value); + if (reg->umin_value != 0) + verbose(",umin_value=%llu", + (unsigned long long)reg->umin_value); + if (reg->umax_value != U64_MAX) + verbose(",umax_value=%llu", + (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; @@ -466,14 +474,25 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void __mark_reg_not_init(struct bpf_reg_state *reg); +/* Mark the unknown part of a register (variable offset or scalar value) as + * known to have the value @imm. + */ +static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +{ + reg->id = 0; + reg->var_off = tnum_const(imm); + reg->smin_value = (s64)imm; + reg->smax_value = (s64)imm; + reg->umin_value = imm; + reg->umax_value = imm; +} + /* Mark the 'variable offset' part of a register as zero. This should be * used only on registers holding a pointer type. */ static void __mark_reg_known_zero(struct bpf_reg_state *reg) { - reg->var_off = tnum_const(0); - reg->min_value = 0; - reg->max_value = 0; + __mark_reg_known(reg, 0); } static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) @@ -488,6 +507,72 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +/* Attempts to improve min/max values based on var_off information */ +static void __update_reg_bounds(struct bpf_reg_state *reg) +{ + /* min signed is max(sign bit) | min(other bits) */ + reg->smin_value = max_t(s64, reg->smin_value, + reg->var_off.value | (reg->var_off.mask & S64_MIN)); + /* max signed is min(sign bit) | max(other bits) */ + reg->smax_value = min_t(s64, reg->smax_value, + reg->var_off.value | (reg->var_off.mask & S64_MAX)); + reg->umin_value = max(reg->umin_value, reg->var_off.value); + reg->umax_value = min(reg->umax_value, + reg->var_off.value | reg->var_off.mask); +} + +/* Uses signed min/max values to inform unsigned, and vice-versa */ +static void __reg_deduce_bounds(struct bpf_reg_state *reg) +{ + /* Learn sign from signed bounds. + * If we cannot cross the sign boundary, then signed and unsigned bounds + * are the same, so combine. This works even in the negative case, e.g. + * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. 
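That example is easy to spot-check outside the verifier. The sketch below (illustrative only, not verifier code) relies on the fact stated above: when the signed bounds do not cross the sign boundary, here both negative, the same 64-bit patterns reinterpreted as u64 bound the value as an unsigned quantity too.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t smin = -3, smax = -1;

	/* Signed range [-3, -1] never crosses the sign boundary, so the
	 * same bit patterns bound x as an unsigned value as well.
	 */
	uint64_t umin = (uint64_t)smin;	/* 0xfffffffffffffffd */
	uint64_t umax = (uint64_t)smax;	/* 0xffffffffffffffff */

	printf("umin=%#llx umax=%#llx\n",
	       (unsigned long long)umin, (unsigned long long)umax);

	/* spot-check: x = -2 lies inside both views of the range */
	int64_t x = -2;
	printf("signed ok=%d unsigned ok=%d\n",
	       x >= smin && x <= smax,
	       (uint64_t)x >= umin && (uint64_t)x <= umax);
	return 0;
}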
+ */ + if (reg->smin_value >= 0 || reg->smax_value < 0) { + reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, + reg->umin_value); + reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, + reg->umax_value); + return; + } + /* Learn sign from unsigned bounds. Signed bounds cross the sign + * boundary, so we must be careful. + */ + if ((s64)reg->umax_value >= 0) { + /* Positive. We can't learn anything from the smin, but smax + * is positive, hence safe. + */ + reg->smin_value = reg->umin_value; + reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, + reg->umax_value); + } else if ((s64)reg->umin_value < 0) { + /* Negative. We can't learn anything from the smax, but smin + * is negative, hence safe. + */ + reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, + reg->umin_value); + reg->smax_value = reg->umax_value; + } +} + +/* Attempts to improve var_off based on unsigned min/max information */ +static void __reg_bound_offset(struct bpf_reg_state *reg) +{ + reg->var_off = tnum_intersect(reg->var_off, + tnum_range(reg->umin_value, + reg->umax_value)); +} + +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; +} + /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(struct bpf_reg_state *reg) { @@ -495,8 +580,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; - reg->min_value = BPF_REGISTER_MIN_RANGE; - reg->max_value = BPF_REGISTER_MAX_RANGE; + __mark_reg_unbounded(reg); } static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) @@ -545,13 +629,6 @@ static void init_reg_state(struct bpf_reg_state *regs) mark_reg_known_zero(regs, BPF_REG_1); } -static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) -{ - regs[regno].min_value = BPF_REGISTER_MIN_RANGE; - regs[regno].max_value = BPF_REGISTER_MAX_RANGE; - regs[regno].value_from_signed = false; -} - enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -716,26 +793,27 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->min_value < 0) { + if (reg->smin_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->min_value + off, size); + err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { verbose("R%d min value is outside of the array range\n", regno); return err; } - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. + /* If we haven't set a max value then we need to bail since we can't be + * sure we won't do bad things. + * If reg->umax_value + off could overflow, treat that as unbounded too. 
*/ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + if (reg->umax_value >= BPF_MAX_VAR_OFF) { verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->max_value + off, size); + err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) verbose("R%d max value is outside of the array range\n", regno); return err; @@ -797,7 +875,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* We don't allow negative numbers, because we aren't tracking enough * detail to prove they're safe. */ - if (reg->min_value < 0) { + if (reg->smin_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1070,12 +1148,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* b/h/w load zero-extends, mark upper bits as known 0 */ state->regs[value_regno].var_off = tnum_cast( state->regs[value_regno].var_off, size); - /* sign bit is known zero, so we can bound the value */ - state->regs[value_regno].min_value = 0; - state->regs[value_regno].max_value = min_t(u64, - state->regs[value_regno].var_off.value | - state->regs[value_regno].var_off.mask, - BPF_REGISTER_MAX_RANGE); + __update_reg_bounds(&state->regs[value_regno]); } return err; } @@ -1333,13 +1406,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ meta = NULL; - if (reg->min_value < 0) { + if (reg->smin_value < 0) { verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } - if (reg->min_value == 0) { + if (reg->umin_value == 0) { err = check_helper_mem_access(env, regno - 1, 0, zero_size_allowed, meta); @@ -1347,13 +1420,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; } - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + if (reg->umax_value >= BPF_MAX_VAR_SIZ) { verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } err = check_helper_mem_access(env, regno - 1, - reg->max_value, + reg->umax_value, zero_size_allowed, meta); } @@ -1600,33 +1673,35 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static void check_reg_overflow(struct bpf_reg_state *reg) -{ - if (reg->max_value > BPF_REGISTER_MAX_RANGE) - reg->max_value = BPF_REGISTER_MAX_RANGE; - if (reg->min_value < BPF_REGISTER_MIN_RANGE || - reg->min_value > BPF_REGISTER_MAX_RANGE) - reg->min_value = BPF_REGISTER_MIN_RANGE; -} - static void coerce_reg_to_32(struct bpf_reg_state *reg) { - /* 32-bit values can't be negative as an s64 */ - if (reg->min_value < 0) - reg->min_value = 0; /* clear high 32 bits */ reg->var_off = tnum_cast(reg->var_off, 4); - /* Did value become known? 
Then update bounds */ - if (tnum_is_const(reg->var_off)) { - if ((s64)reg->var_off.value > BPF_REGISTER_MIN_RANGE) - reg->min_value = reg->var_off.value; - if (reg->var_off.value < BPF_REGISTER_MAX_RANGE) - reg->max_value = reg->var_off.value; - } + /* Update bounds */ + __update_reg_bounds(reg); +} + +static bool signed_add_overflows(s64 a, s64 b) +{ + /* Do the add in u64, where overflow is well-defined */ + s64 res = (s64)((u64)a + (u64)b); + + if (b < 0) + return res > a; + return res < a; +} + +static bool signed_sub_overflows(s64 a, s64 b) +{ + /* Do the sub in u64, where overflow is well-defined */ + s64 res = (s64)((u64)a - (u64)b); + + if (b < 0) + return res < a; + return res > a; } /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. - * Caller must check_reg_overflow all argument regs beforehand. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks. @@ -1638,16 +1713,23 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 min_val = off_reg->min_value; - u64 max_val = off_reg->max_value; + s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, + smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; + u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, + umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; dst_reg = ®s[dst]; - if (WARN_ON_ONCE(known && (min_val != max_val))) { + if (WARN_ON_ONCE(known && (smin_val != smax_val))) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error: known but bad sbounds\n"); + return -EINVAL; + } + if (WARN_ON_ONCE(known && (umin_val != umax_val))) { print_verifier_state(&env->cur_state); - verbose("verifier internal error\n"); + verbose("verifier internal error: known but bad ubounds\n"); return -EINVAL; } @@ -1689,22 +1771,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ - if (known && (ptr_reg->off + min_val == - (s64)(s32)(ptr_reg->off + min_val))) { + if (known && (ptr_reg->off + smin_val == + (s64)(s32)(ptr_reg->off + smin_val))) { /* pointer += K. Accumulate it into fixed offset */ - dst_reg->min_value = ptr_reg->min_value; - dst_reg->max_value = ptr_reg->max_value; + dst_reg->smin_value = smin_ptr; + dst_reg->smax_value = smax_ptr; + dst_reg->umin_value = umin_ptr; + dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; - dst_reg->off = ptr_reg->off + min_val; + dst_reg->off = ptr_reg->off + smin_val; dst_reg->range = ptr_reg->range; break; } - if (max_val == BPF_REGISTER_MAX_RANGE) { - if (!env->allow_ptr_leaks) - verbose("R%d tried to add unbounded value to pointer\n", - dst); - return -EACCES; - } /* A new variable offset is created. Note that off_reg->off * == 0, since it's a scalar. * dst_reg gets the pointer type and since some positive @@ -1714,12 +1792,22 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. 
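signed_add_overflows() and signed_sub_overflows() above avoid signed-overflow undefined behaviour by doing the arithmetic in u64, where wraparound is well defined, and then checking which way the result moved. A self-contained sketch of the same trick (illustrative name, not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool s64_add_overflows(int64_t a, int64_t b)
{
	/* add in u64 (defined wraparound); the conversion back to s64
	 * assumes two's complement, as the kernel helper does
	 */
	int64_t res = (int64_t)((uint64_t)a + (uint64_t)b);

	if (b < 0)
		return res > a;	/* adding a negative must not increase a */
	return res < a;		/* adding a non-negative must not decrease a */
}

int main(void)
{
	printf("%d\n", s64_add_overflows(INT64_MAX, 1));	/* 1: wraps */
	printf("%d\n", s64_add_overflows(INT64_MIN, -1));	/* 1: wraps */
	printf("%d\n", s64_add_overflows(40, 2));		/* 0: fine */
	return 0;
}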
*/ - if (min_val <= BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value += min_val; - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value += max_val; + if (signed_add_overflows(smin_ptr, smin_val) || + signed_add_overflows(smax_ptr, smax_val)) { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = smin_ptr + smin_val; + dst_reg->smax_value = smax_ptr + smax_val; + } + if (umin_ptr + umin_val < umin_ptr || + umax_ptr + umax_val < umax_ptr) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value = umin_ptr + umin_val; + dst_reg->umax_value = umax_ptr + umax_val; + } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; if (ptr_reg->type == PTR_TO_PACKET) { @@ -1746,43 +1834,46 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - if (known && (ptr_reg->off - min_val == - (s64)(s32)(ptr_reg->off - min_val))) { + if (known && (ptr_reg->off - smin_val == + (s64)(s32)(ptr_reg->off - smin_val))) { /* pointer -= K. Subtract it from fixed offset */ - dst_reg->min_value = ptr_reg->min_value; - dst_reg->max_value = ptr_reg->max_value; + dst_reg->smin_value = smin_ptr; + dst_reg->smax_value = smax_ptr; + dst_reg->umin_value = umin_ptr; + dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; dst_reg->id = ptr_reg->id; - dst_reg->off = ptr_reg->off - min_val; + dst_reg->off = ptr_reg->off - smin_val; dst_reg->range = ptr_reg->range; break; } - /* Subtracting a negative value will just confuse everything. - * This can happen if off_reg is an immediate. - */ - if ((s64)max_val < 0) { - if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract negative max_val %lld from pointer\n", - dst, (s64)max_val); - return -EACCES; - } /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. 
*/ - if (max_val >= BPF_REGISTER_MAX_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= max_val; - if (min_val <= BPF_REGISTER_MIN_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= min_val; + if (signed_sub_overflows(smin_ptr, smax_val) || + signed_sub_overflows(smax_ptr, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = smin_ptr - smax_val; + dst_reg->smax_value = smax_ptr - smin_val; + } + if (umin_ptr < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->umin_value = umin_ptr - umax_val; + dst_reg->umax_value = umax_ptr - umin_val; + } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; if (ptr_reg->type == PTR_TO_PACKET) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - if (min_val < 0) + if (smin_val < 0) dst_reg->range = 0; } break; @@ -1805,7 +1896,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - check_reg_overflow(dst_reg); + __update_reg_bounds(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); return 0; } @@ -1815,157 +1908,217 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { struct bpf_reg_state *regs = env->cur_state.regs; - s64 min_val = BPF_REGISTER_MIN_RANGE; - u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; + s64 smin_val, smax_val; + u64 umin_val, umax_val; if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->64 */ coerce_reg_to_32(dst_reg); coerce_reg_to_32(&src_reg); } - min_val = src_reg.min_value; - max_val = src_reg.max_value; + smin_val = src_reg.smin_value; + smax_val = src_reg.smax_value; + umin_val = src_reg.umin_value; + umax_val = src_reg.umax_value; src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); switch (opcode) { case BPF_ADD: - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value += min_val; - /* if max_val is MAX_RANGE, this will saturate dst->max */ - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value += max_val; + if (signed_add_overflows(dst_reg->smin_value, smin_val) || + signed_add_overflows(dst_reg->smax_value, smax_val)) { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value += smin_val; + dst_reg->smax_value += smax_val; + } + if (dst_reg->umin_value + umin_val < umin_val || + dst_reg->umax_value + umax_val < umax_val) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value += umin_val; + dst_reg->umax_value += umax_val; + } dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= max_val; - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= 
min_val; + if (signed_sub_overflows(dst_reg->smin_value, smax_val) || + signed_sub_overflows(dst_reg->smax_value, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value -= smax_val; + dst_reg->smax_value -= smin_val; + } + if (dst_reg->umin_value < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->umin_value -= umax_val; + dst_reg->umax_value -= umin_val; + } dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_MUL: - if (min_val < 0 || dst_reg->min_value < 0) { + dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); + if (smin_val < 0 || dst_reg->smin_value < 0) { /* Ain't nobody got time to multiply that sign */ - __mark_reg_unknown(dst_reg); + __mark_reg_unbounded(dst_reg); + __update_reg_bounds(dst_reg); break; } - dst_reg->min_value *= min_val; - /* if max_val is MAX_RANGE, this will saturate dst->max. - * We know MAX_RANGE ** 2 won't overflow a u64, because - * MAX_RANGE itself fits in a u32. + /* Both values are positive, so we can work with unsigned and + * copy the result to signed (unless it exceeds S64_MAX). */ - BUILD_BUG_ON(BPF_REGISTER_MAX_RANGE > (u32)-1); - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value *= max_val; - dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); + if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { + /* Potential overflow, we know nothing */ + __mark_reg_unbounded(dst_reg); + /* (except what we can learn from the var_off) */ + __update_reg_bounds(dst_reg); + break; + } + dst_reg->umin_value *= umin_val; + dst_reg->umax_value *= umax_val; + if (dst_reg->umax_value > S64_MAX) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } break; case BPF_AND: if (src_known && dst_known) { - u64 value = dst_reg->var_off.value & src_reg.var_off.value; - - dst_reg->var_off = tnum_const(value); - dst_reg->min_value = dst_reg->max_value = min_t(u64, - value, BPF_REGISTER_MAX_RANGE); + __mark_reg_known(dst_reg, dst_reg->var_off.value & + src_reg.var_off.value); break; } - /* Lose min_value when AND'ing negative numbers, ain't nobody - * got time for that. Otherwise we get our minimum from the - * var_off, since that's inherently bitwise. - * Our maximum is the minimum of the operands' maxima. + /* We get our minimum from the var_off, since that's inherently + * bitwise. Our maximum is the minimum of the operands' maxima. */ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); - if (min_val < 0 && dst_reg->min_value < 0) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else - dst_reg->min_value = dst_reg->var_off.value; - dst_reg->max_value = min(dst_reg->max_value, max_val); + dst_reg->umin_value = dst_reg->var_off.value; + dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ANDing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + /* ANDing two positives gives a positive, so safe to + * cast result into s64. 
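The "minimum from the var_off" step in the AND case works because, for a value tracked as known bits plus an unknown-bit mask, the smallest it can be is the known value (all unknown bits taken as 0) and the largest is value | mask (all unknown bits taken as 1). A tiny illustrative sketch, with simplified fields:

#include <stdint.h>
#include <stdio.h>

struct knownbits { uint64_t value; uint64_t mask; };	/* mask = unknown bits */

int main(void)
{
	/* e.g. result of (unknown u8) & 0x30: bits 4-5 unknown, rest known 0 */
	struct knownbits r = { 0x00, 0x30 };

	uint64_t umin = r.value;		/* every unknown bit taken as 0 */
	uint64_t umax = r.value | r.mask;	/* every unknown bit taken as 1 */

	/* prints umin=0 umax=0x30 */
	printf("umin=%#llx umax=%#llx\n",
	       (unsigned long long)umin, (unsigned long long)umax);
	return 0;
}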
+ */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; case BPF_OR: if (src_known && dst_known) { - u64 value = dst_reg->var_off.value | src_reg.var_off.value; - - dst_reg->var_off = tnum_const(value); - dst_reg->min_value = dst_reg->max_value = min_t(u64, - value, BPF_REGISTER_MAX_RANGE); + __mark_reg_known(dst_reg, dst_reg->var_off.value | + src_reg.var_off.value); break; } - /* Lose ranges when OR'ing negative numbers, ain't nobody got - * time for that. Otherwise we get our maximum from the var_off, - * and our minimum is the maximum of the operands' minima. + /* We get our maximum from the var_off, and our minimum is the + * maximum of the operands' minima */ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); - if (min_val < 0 || dst_reg->min_value < 0) { - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + dst_reg->umin_value = max(dst_reg->umin_value, umin_val); + dst_reg->umax_value = dst_reg->var_off.value | + dst_reg->var_off.mask; + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ORing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } else { - dst_reg->min_value = max(dst_reg->min_value, min_val); - dst_reg->max_value = dst_reg->var_off.value | dst_reg->var_off.mask; + /* ORing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; case BPF_LSH: - if (min_val < 0) { - /* LSH by a negative number is undefined */ + if (umax_val > 63) { + /* Shifts greater than 63 are undefined. This includes + * shifts by a negative number. + */ mark_reg_unknown(regs, insn->dst_reg); break; } - /* Gotta have special overflow logic here, if we're shifting - * more than MAX_RANGE then just assume we have an invalid - * range. + /* We lose all sign bit information (except what we can pick + * up from var_off) */ - if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - dst_reg->var_off = tnum_unknown; + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + /* If we might shift our top bit out, then we know nothing */ + if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; } else { - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value <<= min_val; - if (src_known) - dst_reg->var_off = tnum_lshift(dst_reg->var_off, min_val); - else - dst_reg->var_off = tnum_lshift(tnum_unknown, min_val); + dst_reg->umin_value <<= umin_val; + dst_reg->umax_value <<= umax_val; } - if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value <<= max_val; + if (src_known) + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); + else + dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val); + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; case BPF_RSH: - if (min_val < 0) { - /* RSH by a negative number is undefined */ + if (umax_val > 63) { + /* Shifts greater than 63 are undefined. This includes + * shifts by a negative number. 
+ */ mark_reg_unknown(regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ - if (dst_reg->min_value < 0) { - if (min_val) + if (dst_reg->smin_value < 0) { + if (umin_val) { /* Sign bit will be cleared */ - dst_reg->min_value = 0; + dst_reg->smin_value = 0; + } else { + /* Lost sign bit information */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } } else { - dst_reg->min_value = - (u64)(dst_reg->min_value) >> min_val; + dst_reg->smin_value = + (u64)(dst_reg->smin_value) >> umax_val; } if (src_known) - dst_reg->var_off = tnum_rshift(dst_reg->var_off, min_val); + dst_reg->var_off = tnum_rshift(dst_reg->var_off, + umin_val); else - dst_reg->var_off = tnum_rshift(tnum_unknown, min_val); - if (dst_reg->max_value == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = ~0; - dst_reg->max_value >>= max_val; + dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val); + dst_reg->umin_value >>= umax_val; + dst_reg->umax_value >>= umin_val; + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; default: mark_reg_unknown(regs, insn->dst_reg); break; } - check_reg_overflow(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); return 0; } @@ -1981,14 +2134,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, int rc; dst_reg = ®s[insn->dst_reg]; - check_reg_overflow(dst_reg); src_reg = NULL; if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; if (BPF_SRC(insn->code) == BPF_X) { src_reg = ®s[insn->src_reg]; - check_reg_overflow(src_reg); - if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields @@ -2035,11 +2185,8 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * need to be able to read from this state. */ off_reg.type = SCALAR_VALUE; - off_reg.var_off = tnum_const(insn->imm); - off_reg.min_value = insn->imm; - off_reg.max_value = insn->imm; + __mark_reg_known(&off_reg, insn->imm); src_reg = &off_reg; - check_reg_overflow(src_reg); if (ptr_reg) { /* pointer += K */ rc = adjust_ptr_min_max_vals(env, insn, ptr_reg, src_reg); @@ -2144,22 +2291,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } mark_reg_unknown(regs, insn->dst_reg); - /* high 32 bits are known zero. But this is - * still out of range for max_value, so leave - * that. - */ + /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); + __update_reg_bounds(®s[insn->dst_reg]); } } else { /* case: R = imm * remember the value we stored into this reg */ regs[insn->dst_reg].type = SCALAR_VALUE; - regs[insn->dst_reg].var_off = tnum_const(insn->imm); - regs[insn->dst_reg].max_value = insn->imm; - regs[insn->dst_reg].min_value = insn->imm; - regs[insn->dst_reg].id = 0; + __mark_reg_known(regs + insn->dst_reg, insn->imm); } } else if (opcode > BPF_END) { @@ -2226,8 +2368,8 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* This doesn't give us any range */ return; - if (dst_reg->max_value > MAX_PACKET_OFF || - dst_reg->max_value + dst_reg->off > MAX_PACKET_OFF) + if (dst_reg->umax_value > MAX_PACKET_OFF || + dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. 
*/ @@ -2291,8 +2433,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { - bool value_from_signed = true; - /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. @@ -2307,62 +2447,45 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ - true_reg->max_value = true_reg->min_value = val; - true_reg->var_off = tnum_const(val); + __mark_reg_known(true_reg, val); break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ - false_reg->max_value = false_reg->min_value = val; - false_reg->var_off = tnum_const(val); + __mark_reg_known(false_reg, val); break; case BPF_JGT: - value_from_signed = false; - /* fallthrough */ + false_reg->umax_value = min(false_reg->umax_value, val); + true_reg->umin_value = max(true_reg->umin_value, val + 1); + break; case BPF_JSGT: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGT) { - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; - } - /* If this is false then we know the maximum val is val, - * otherwise we know the min val is val+1. - */ - false_reg->max_value = val; - false_reg->value_from_signed = value_from_signed; - true_reg->min_value = val + 1; - true_reg->value_from_signed = value_from_signed; + false_reg->smax_value = min_t(s64, false_reg->smax_value, val); + true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); break; case BPF_JGE: - value_from_signed = false; - /* fallthrough */ + false_reg->umax_value = min(false_reg->umax_value, val - 1); + true_reg->umin_value = max(true_reg->umin_value, val); + break; case BPF_JSGE: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGE) { - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; - } - /* If this is false then we know the maximum value is val - 1, - * otherwise we know the mimimum value is val. - */ - false_reg->max_value = val - 1; - false_reg->value_from_signed = value_from_signed; - true_reg->min_value = val; - true_reg->value_from_signed = value_from_signed; + false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); + true_reg->smin_value = max_t(s64, true_reg->smin_value, val); break; default: break; } - check_reg_overflow(false_reg); - check_reg_overflow(true_reg); + __reg_deduce_bounds(false_reg); + __reg_deduce_bounds(true_reg); + /* We might have learned some bits from the bounds. */ + __reg_bound_offset(false_reg); + __reg_bound_offset(true_reg); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. 
+ */ + __update_reg_bounds(false_reg); + __update_reg_bounds(true_reg); } /* Same as above, but for the case that dst_reg holds a constant and src_reg is @@ -2372,8 +2495,6 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { - bool value_from_signed = true; - if (__is_pointer_value(false, false_reg)) return; @@ -2382,77 +2503,76 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ - true_reg->max_value = true_reg->min_value = val; - true_reg->var_off = tnum_const(val); + __mark_reg_known(true_reg, val); break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ - false_reg->max_value = false_reg->min_value = val; - false_reg->var_off = tnum_const(val); + __mark_reg_known(false_reg, val); break; case BPF_JGT: - value_from_signed = false; - /* fallthrough */ + true_reg->umax_value = min(true_reg->umax_value, val - 1); + false_reg->umin_value = max(false_reg->umin_value, val); + break; case BPF_JSGT: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGT) { - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; - } - /* - * If this is false, then the val is <= the register, if it is - * true the register <= to the val. - */ - false_reg->min_value = val; - false_reg->value_from_signed = value_from_signed; - true_reg->max_value = val - 1; - true_reg->value_from_signed = value_from_signed; + true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); + false_reg->smin_value = max_t(s64, false_reg->smin_value, val); break; case BPF_JGE: - value_from_signed = false; - /* fallthrough */ + true_reg->umax_value = min(true_reg->umax_value, val); + false_reg->umin_value = max(false_reg->umin_value, val + 1); + break; case BPF_JSGE: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGE) { - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; - } - /* If this is false then constant < register, if it is true then - * the register < constant. - */ - false_reg->min_value = val + 1; - false_reg->value_from_signed = value_from_signed; - true_reg->max_value = val; - true_reg->value_from_signed = value_from_signed; + true_reg->smax_value = min_t(s64, true_reg->smax_value, val); + false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); break; default: break; } - check_reg_overflow(false_reg); - check_reg_overflow(true_reg); + __reg_deduce_bounds(false_reg); + __reg_deduce_bounds(true_reg); + /* We might have learned some bits from the bounds. */ + __reg_bound_offset(false_reg); + __reg_bound_offset(true_reg); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. 
+ */ + __update_reg_bounds(false_reg); + __update_reg_bounds(true_reg); } /* Regs are known to be equal, so intersect their min/max/var_off */ static void __reg_combine_min_max(struct bpf_reg_state *src_reg, struct bpf_reg_state *dst_reg) { - src_reg->min_value = dst_reg->min_value = max(src_reg->min_value, - dst_reg->min_value); - src_reg->max_value = dst_reg->max_value = min(src_reg->max_value, - dst_reg->max_value); + src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value, + dst_reg->umin_value); + src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value, + dst_reg->umax_value); + src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value, + dst_reg->smin_value); + src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value, + dst_reg->smax_value); src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, dst_reg->var_off); - check_reg_overflow(src_reg); - check_reg_overflow(dst_reg); + /* We might have learned new bounds from the var_off. */ + __update_reg_bounds(src_reg); + __update_reg_bounds(dst_reg); + /* We might have learned something about the sign bit. */ + __reg_deduce_bounds(src_reg); + __reg_deduce_bounds(dst_reg); + /* We might have learned some bits from the bounds. */ + __reg_bound_offset(src_reg); + __reg_bound_offset(dst_reg); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __update_reg_bounds(src_reg); + __update_reg_bounds(dst_reg); } static void reg_combine_min_max(struct bpf_reg_state *true_src, @@ -2467,6 +2587,7 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, break; case BPF_JNE: __reg_combine_min_max(false_src, false_dst); + break; } } @@ -2480,11 +2601,11 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. */ - if (WARN_ON_ONCE(reg->min_value || reg->max_value || - reg->var_off.value || reg->var_off.mask || + if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || + !tnum_equals_const(reg->var_off, 0) || reg->off)) { - reg->min_value = reg->max_value = reg->off = 0; - reg->var_off = tnum_const(0); + __mark_reg_known_zero(reg); + reg->off = 0; } if (is_null) { reg->type = SCALAR_VALUE; @@ -2676,11 +2797,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; regs[insn->dst_reg].type = SCALAR_VALUE; - regs[insn->dst_reg].min_value = imm; - regs[insn->dst_reg].max_value = imm; - check_reg_overflow(®s[insn->dst_reg]); - regs[insn->dst_reg].var_off = tnum_const(imm); - regs[insn->dst_reg].id = 0; + __mark_reg_known(®s[insn->dst_reg], imm); return 0; } @@ -2968,8 +3085,10 @@ err_free: static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) { - return old->min_value <= cur->min_value && - old->max_value >= cur->max_value; + return old->umin_value <= cur->umin_value && + old->umax_value >= cur->umax_value && + old->smin_value <= cur->smin_value && + old->smax_value >= cur->smax_value; } /* Maximum number of register states that can exist at once */ @@ -3032,8 +3151,10 @@ static bool regsafe(struct bpf_reg_state *rold, * equal, because we can't know anything about the * scalar value of the pointer in the new value. 
*/ - return rold->min_value == BPF_REGISTER_MIN_RANGE && - rold->max_value == BPF_REGISTER_MAX_RANGE && + return rold->umin_value == 0 && + rold->umax_value == U64_MAX && + rold->smin_value == S64_MIN && + rold->smax_value == S64_MAX && tnum_is_unknown(rold->var_off); } case PTR_TO_MAP_VALUE: -- cgit v1.2.3 From 7d1238f21026e277936fff408b73bc19e89239a8 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 7 Aug 2017 15:26:56 +0100 Subject: bpf/verifier: more concise register state logs for constant var_off Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7557800bf7a7..08a6fa0369c2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,25 +234,33 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose(",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); - if (reg->smin_value != reg->umin_value && - reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", - (long long)reg->smin_value); - if (reg->smax_value != reg->umax_value && - reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", - (long long)reg->smax_value); - if (reg->umin_value != 0) - verbose(",umin_value=%llu", - (unsigned long long)reg->umin_value); - if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", - (unsigned long long)reg->umax_value); - if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + if (tnum_is_const(reg->var_off)) { + /* Typically an immediate SCALAR_VALUE, but + * could be a pointer whose offset is too big + * for reg->off + */ + verbose(",imm=%llx", reg->var_off.value); + } else { + if (reg->smin_value != reg->umin_value && + reg->smin_value != S64_MIN) + verbose(",smin_value=%lld", + (long long)reg->smin_value); + if (reg->smax_value != reg->umax_value && + reg->smax_value != S64_MAX) + verbose(",smax_value=%lld", + (long long)reg->smax_value); + if (reg->umin_value != 0) + verbose(",umin_value=%llu", + (unsigned long long)reg->umin_value); + if (reg->umax_value != U64_MAX) + verbose(",umax_value=%llu", + (unsigned long long)reg->umax_value); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(",var_off=%s", tn_buf); + } } verbose(")"); } -- cgit v1.2.3 From 8e17c1b16277cba0e9426de6fe78817df378f45c Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 7 Aug 2017 15:30:30 +0100 Subject: bpf/verifier: increase complexity limit to 128k The more detailed value tracking can reduce the effectiveness of pruning for some programs. So, to avoid rejecting previously valid programs, up the limit to 128kinsns. Hopefully we will be able to bring this back down later by improving pruning performance. Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 08a6fa0369c2..8160a81a40bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 98304 +#define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 #define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) -- cgit v1.2.3 From 58291a7465f6b88248c9f34807c16705bd5698f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 7 Aug 2017 20:45:19 +0200 Subject: bpf: Move check_uarg_tail_zero() upward MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function check_uarg_tail_zero() may be useful for other part of the code in the syscall.c file. Move this function at the beginning of the file. Signed-off-by: Mickaël Salaün Acked-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: David S. Miller Cc: Kees Cook Cc: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6c772adabad2..c653ee0bd162 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,6 +48,32 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #undef BPF_MAP_TYPE }; +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) +{ + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + int err; + + if (actual_size <= expected_size) + return 0; + + addr = uaddr + expected_size; + end = uaddr + actual_size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + + return 0; +} + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { struct bpf_map *map; @@ -1246,32 +1272,6 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } -static int check_uarg_tail_zero(void __user *uaddr, - size_t expected_size, - size_t actual_size) -{ - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - int err; - - if (actual_size <= expected_size) - return 0; - - addr = uaddr + expected_size; - end = uaddr + actual_size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - - return 0; -} - static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) -- cgit v1.2.3 From 752ba56fb130b27c32c2ae6c82c8ef246b22106c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 7 Aug 2017 20:45:20 +0200 Subject: bpf: Extend check_uarg_tail_zero() checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function check_uarg_tail_zero() was created from bpf(2) for BPF_OBJ_GET_INFO_BY_FD without taking the access_ok() nor the PAGE_SIZE checks. Make this checks more generally available while unlikely to be triggered, extend the memory range check and add an explanation including why the ToCToU should not be a security concern. Signed-off-by: Mickaël Salaün Acked-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: David S. 
Miller Cc: Kees Cook Cc: Martin KaFai Lau Link: https://lkml.kernel.org/r/CAGXu5j+vRGFvJZmjtAcT8Hi8B+Wz0e1b6VKYZHfQP_=DXzC4CQ@mail.gmail.com Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c653ee0bd162..fbe09a0cccf4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,6 +48,15 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #undef BPF_MAP_TYPE }; +/* + * If we're handed a bigger struct than we know of, ensure all the unknown bits + * are 0 - i.e. new user-space does not rely on any kernel feature extensions + * we don't know about yet. + * + * There is a ToCToU between this function call and the following + * copy_from_user() call. However, this is not a concern since this function is + * meant to be a future-proofing of bits. + */ static int check_uarg_tail_zero(void __user *uaddr, size_t expected_size, size_t actual_size) @@ -57,6 +66,12 @@ static int check_uarg_tail_zero(void __user *uaddr, unsigned char val; int err; + if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ + return -E2BIG; + + if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size))) + return -EFAULT; + if (actual_size <= expected_size) return 0; @@ -1393,17 +1408,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) return -EPERM; - if (!access_ok(VERIFY_READ, uattr, 1)) - return -EFAULT; - - if (size > PAGE_SIZE) /* silly large */ - return -E2BIG; - - /* If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ err = check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; -- cgit v1.2.3 From 48fb6f4db940e92cfb16cd878cddd59ea6120d06 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 9 Aug 2017 08:27:11 +0100 Subject: futex: Remove unnecessary warning from get_futex_key Commit 65d8fc777f6d ("futex: Remove requirement for lock_page() in get_futex_key()") removed an unnecessary lock_page() with the side-effect that page->mapping needed to be treated very carefully. Two defensive warnings were added in case any assumption was missed and the first warning assumed a correct application would not alter a mapping backing a futex key. Since merging, it has not triggered for any unexpected case but Mark Rutland reported the following bug triggering due to the first warning. kernel BUG at kernel/futex.c:679! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 3695 Comm: syz-executor1 Not tainted 4.13.0-rc3-00020-g307fec773ba3 #3 Hardware name: linux,dummy-virt (DT) task: ffff80001e271780 task.stack: ffff000010908000 PC is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 LR is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 pc : [] lr : [] pstate: 80000145 The fact that it's a bug instead of a warning was due to an unrelated arm64 problem, but the warning itself triggered because the underlying mapping changed. This is an application issue but from a kernel perspective it's a recoverable situation and the warning is unnecessary so this patch removes the warning. The warning may potentially be triggered with the following test program from Mark although it may be necessary to adjust NR_FUTEX_THREADS to be a value smaller than the number of CPUs in the system. 
#include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <pthread.h> #include <time.h> #include <sys/mman.h> #include <sys/syscall.h> #include <linux/futex.h> #define NR_FUTEX_THREADS 16 pthread_t threads[NR_FUTEX_THREADS]; void *mem; #define MEM_PROT (PROT_READ | PROT_WRITE) #define MEM_SIZE 65536 static int futex_wrapper(int *uaddr, int op, int val, const struct timespec *timeout, int *uaddr2, int val3) { syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3); } void *poll_futex(void *unused) { for (;;) { futex_wrapper(mem, FUTEX_CMP_REQUEUE_PI, 1, NULL, mem + 4, 1); } } int main(int argc, char *argv[]) { int i; mem = mmap(NULL, MEM_SIZE, MEM_PROT, MAP_SHARED | MAP_ANONYMOUS, -1, 0); printf("Mapping @ %p\n", mem); printf("Creating futex threads...\n"); for (i = 0; i < NR_FUTEX_THREADS; i++) pthread_create(&threads[i], NULL, poll_futex, NULL); printf("Flipping mapping...\n"); for (;;) { mmap(mem, MEM_SIZE, MEM_PROT, MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0); } return 0; } Reported-and-tested-by: Mark Rutland Signed-off-by: Mel Gorman Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 16dbe4c93895..f50b434756c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -670,13 +670,14 @@ again: * this reference was taken by ihold under the page lock * pinning the inode in place so i_lock was unnecessary. The * only way for this check to fail is if the inode was - * truncated in parallel so warn for now if this happens. + * truncated in parallel which is almost certainly an + * application bug. In such a case, just retry. * * We are not calling into get_futex_key_refs() in file-backed * cases, therefore a successful atomic_inc return below will * guarantee that get_futex_key() will still imply smp_mb(); (B). */ - if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + if (!atomic_inc_not_zero(&inode->i_count)) { rcu_read_unlock(); put_page(page); -- cgit v1.2.3 From 209887e6b974c22328487b55d0f390522b014b03 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 9 Aug 2017 10:21:46 +0530 Subject: cpufreq: Return 0 from ->fast_switch() on errors CPUFREQ_ENTRY_INVALID is a special symbol which is used to specify that an entry in the cpufreq table is invalid. But using it outside of the scope of the cpufreq table looks a bit incorrect. We can represent an invalid frequency by writing it as 0 instead if we need. Note that it is already done that way for the return value of the ->get() callback. Let's do the same for ->fast_switch() and not use CPUFREQ_ENTRY_INVALID outside of the scope of the cpufreq table. Also update the comment over cpufreq_driver_fast_switch() to clearly mention what this returns. None of the drivers return CPUFREQ_ENTRY_INVALID as of now from ->fast_switch() callback and so we don't need to update any of those. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- kernel/sched/cpufreq_schedutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7dbc76801f86..2ba04bb3182a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -123,7 +123,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (policy->fast_switch_enabled) { next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (next_freq == CPUFREQ_ENTRY_INVALID) + if (!next_freq) return; policy->cur = next_freq; -- cgit v1.2.3 From 92b31a9af73b3a3fc801899335d6c47966351830 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Aug 2017 01:39:55 +0200 Subject: bpf: add BPF_J{LT,LE,SLT,SLE} instructions Currently, eBPF only understands BPF_JGT (>), BPF_JGE (>=), BPF_JSGT (s>), BPF_JSGE (s>=) instructions, this means that particularly *JLT/*JLE counterparts involving immediates need to be rewritten from e.g. X < [IMM] by swapping arguments into [IMM] > X, meaning the immediate first is required to be loaded into a register Y := [IMM], such that then we can compare with Y > X. Note that the destination operand is always required to be a register. This has the downside of having unnecessarily increased register pressure, meaning complex program would need to spill other registers temporarily to stack in order to obtain an unused register for the [IMM]. Loading to registers will thus also affect state pruning since we need to account for that register use and potentially those registers that had to be spilled/filled again. As a consequence slightly more stack space might have been used due to spilling, and BPF programs are a bit longer due to extra code involving the register load and potentially required spill/fills. Thus, add BPF_JLT (<), BPF_JLE (<=), BPF_JSLT (s<), BPF_JSLE (s<=) counterparts to the eBPF instruction set. Modifying LLVM to remove the NegateCC() workaround in a PoC patch at [1] and allowing it to also emit the new instructions resulted in cilium's BPF programs that are injected into the fast-path to have a reduced program length in the range of 2-3% (e.g. accumulated main and tail call sections from one of the object file reduced from 4864 to 4729 insns), reduced complexity in the range of 10-30% (e.g. accumulated sections reduced in one of the cases from 116432 to 88428 insns), and reduced stack usage in the range of 1-5% (e.g. accumulated sections from one of the object files reduced from 824 to 784b). The modification for LLVM will be incorporated in a backwards compatible way. Plan is for LLVM to have i) a target specific option to offer a possibility to explicitly enable the extension by the user (as we have with -m target specific extensions today for various CPU insns), and ii) have the kernel checked for presence of the extensions and enable them transparently when the user is selecting more aggressive options such as -march=native in a bpf target context. (Other frontends generating BPF byte code, e.g. ply can probe the kernel directly for its code generation.) [1] https://github.com/borkmann/llvm/tree/bpf-insns Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/core.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad5f55922a13..c69e7f5bfde7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; @@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, + [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, + [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, + [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, + [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, /* Program return */ @@ -1073,6 +1085,18 @@ out: CONT_JMP; } CONT; + JMP_JLT_X: + if (DST < SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLT_K: + if (DST < IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JGE_X: if (DST >= SRC) { insn += insn->off; @@ -1085,6 +1109,18 @@ out: CONT_JMP; } CONT; + JMP_JLE_X: + if (DST <= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLE_K: + if (DST <= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGT_X: if (((s64) DST) > ((s64) SRC)) { insn += insn->off; @@ -1097,6 +1133,18 @@ out: CONT_JMP; } CONT; + JMP_JSLT_X: + if (((s64) DST) < ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLT_K: + if (((s64) DST) < ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGE_X: if (((s64) DST) >= ((s64) SRC)) { insn += insn->off; @@ -1109,6 +1157,18 @@ out: CONT_JMP; } CONT; + JMP_JSLE_X: + if (((s64) DST) <= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLE_K: + if (((s64) DST) <= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSET_X: if (DST & SRC) { insn += insn->off; -- cgit v1.2.3 From b4e432f1000a171d901e42551459059831925770 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Aug 2017 01:40:02 +0200 Subject: bpf: enable BPF_J{LT, LE, SLT, SLE} opcodes in verifier Enable the newly added jump opcodes, main parts are in two different areas, namely direct packet access and dynamic map value access. 
For the direct packet access, we now allow for the following two new patterns to match in order to trigger markings with find_good_pkt_pointers(): Variant 1 (access ok when taking the branch): 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (bf) r0 = r2 3: (07) r0 += 8 4: (ad) if r0 < r3 goto pc+2 R0=pkt(id=0,off=8,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R10=fp 5: (b7) r0 = 0 6: (95) exit from 4 to 7: R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R10=fp 7: (71) r0 = *(u8 *)(r2 +0) 8: (05) goto pc-4 5: (b7) r0 = 0 6: (95) exit processed 11 insns, stack depth 0 Variant 2 (access ok on fall-through): 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (bf) r0 = r2 3: (07) r0 += 8 4: (bd) if r3 <= r0 goto pc+1 R0=pkt(id=0,off=8,r=8) R1=ctx R2=pkt(id=0,off=0,r=8) R3=pkt_end R10=fp 5: (71) r0 = *(u8 *)(r2 +0) 6: (b7) r0 = 1 7: (95) exit from 4 to 6: R0=pkt(id=0,off=8,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R10=fp 6: (b7) r0 = 1 7: (95) exit processed 10 insns, stack depth 0 The above two basically just swap the branches where we need to handle an exception and allow packet access compared to the two already existing variants for find_good_pkt_pointers(). For the dynamic map value access, we add the new instructions to reg_set_min_max() and reg_set_min_max_inv() in order to learn bounds. Verifier test cases for both are added in a follow-up patch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8160a81a40bf..ecc590e01a1d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -312,11 +312,15 @@ static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", [BPF_JSET >> 4] = "&", [BPF_JNE >> 4] = "!=", [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", [BPF_CALL >> 4] = "call", [BPF_EXIT >> 4] = "exit", }; @@ -2383,27 +2387,37 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, */ return; - /* LLVM can generate two kind of checks: + /* LLVM can generate four kind of checks: * - * Type 1: + * Type 1/2: * * r2 = r3; * r2 += 8; * if (r2 > pkt_end) goto * * + * r2 = r3; + * r2 += 8; + * if (r2 < pkt_end) goto + * + * * Where: * r2 == dst_reg, pkt_end == src_reg * r2=pkt(id=n,off=8,r=0) * r3=pkt(id=n,off=0,r=0) * - * Type 2: + * Type 3/4: * * r2 = r3; * r2 += 8; * if (pkt_end >= r2) goto * * + * r2 = r3; + * r2 += 8; + * if (pkt_end <= r2) goto + * + * * Where: * pkt_end == dst_reg, r2 == src_reg * r2=pkt(id=n,off=8,r=0) @@ -2471,6 +2485,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, false_reg->smax_value = min_t(s64, false_reg->smax_value, val); true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); break; + case BPF_JLT: + false_reg->umin_value = max(false_reg->umin_value, val); + true_reg->umax_value = min(true_reg->umax_value, val - 1); + break; + case BPF_JSLT: + false_reg->smin_value = max_t(s64, false_reg->smin_value, val); + true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); + break; case BPF_JGE: false_reg->umax_value = min(false_reg->umax_value, val - 1); true_reg->umin_value = max(true_reg->umin_value, val); 
@@ -2479,6 +2501,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); true_reg->smin_value = max_t(s64, true_reg->smin_value, val); break; + case BPF_JLE: + false_reg->umin_value = max(false_reg->umin_value, val + 1); + true_reg->umax_value = min(true_reg->umax_value, val); + break; + case BPF_JSLE: + false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); + true_reg->smax_value = min_t(s64, true_reg->smax_value, val); + break; default: break; } @@ -2527,6 +2557,14 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); false_reg->smin_value = max_t(s64, false_reg->smin_value, val); break; + case BPF_JLT: + true_reg->umin_value = max(true_reg->umin_value, val + 1); + false_reg->umax_value = min(false_reg->umax_value, val); + break; + case BPF_JSLT: + true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); + false_reg->smax_value = min_t(s64, false_reg->smax_value, val); + break; case BPF_JGE: true_reg->umax_value = min(true_reg->umax_value, val); false_reg->umin_value = max(false_reg->umin_value, val + 1); @@ -2535,6 +2573,14 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, true_reg->smax_value = min_t(s64, true_reg->smax_value, val); false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); break; + case BPF_JLE: + true_reg->umin_value = max(true_reg->umin_value, val); + false_reg->umax_value = min(false_reg->umax_value, val - 1); + break; + case BPF_JSLE: + true_reg->smin_value = max_t(s64, true_reg->smin_value, val); + false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); + break; default: break; } @@ -2659,7 +2705,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); int err; - if (opcode > BPF_EXIT) { + if (opcode > BPF_JSLE) { verbose("invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } @@ -2761,10 +2807,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { find_good_pkt_pointers(this_branch, dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + find_good_pkt_pointers(other_branch, dst_reg); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; -- cgit v1.2.3 From bfe334924ccd9f4a53f30240c03cf2f43f5b2df1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Aug 2017 19:39:30 +0200 Subject: perf/x86: Fix RDPMC vs. mm_struct tracking Vince reported the following rdpmc() testcase failure: > Failing test case: > > fd=perf_event_open(); > addr=mmap(fd); > exec() // without closing or unmapping the event > fd=perf_event_open(); > addr=mmap(fd); > rdpmc() // GPFs due to rdpmc being disabled The problem is of course that exec() plays tricks with what is current->mm, only destroying the old mappings after having installed the new mm. 
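That is, by the time the old mappings are finally torn down, current->mm already refers to the new image, so per-mm state touched from the event_unmapped() callback can land on the wrong mm. Condensed from the hunks below, the perf mmap paths now hand the callbacks the mm that owns the VMA instead:

	if (event->pmu->event_mapped)
		event->pmu->event_mapped(event, vma->vm_mm);
	...
	if (event->pmu->event_unmapped)
		event->pmu->event_unmapped(event, vma->vm_mm);
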
Fix this confusion by passing along vma->vm_mm instead of relying on current->mm. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Link: http://lkml.kernel.org/r/20170802173930.cstykcqefmqt7jau@hirez.programming.kicks-ass.net [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..a654b8a3586f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5113,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5411,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } -- cgit v1.2.3 From 9b231d9f47c6114d317ce28cff92a74ad80547f5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Aug 2017 15:42:09 +0200 Subject: perf/core: Fix time on IOC_ENABLE Vince reported that when we do IOC_ENABLE/IOC_DISABLE while the task is SIGSTOP'ed state the timestamps go wobbly. It turns out we indeed fail to correctly account time while in 'OFF' state and doing IOC_ENABLE without getting scheduled in exposes the problem. Further thinking about this problem, it occurred to me that we can suffer a similar fate when we migrate an uncore event between CPUs. The perf_event_install() on the 'new' CPU will do add_event_to_ctx() which will reset all the time stamp, resulting in a subsequent update_event_times() to overwrite the total_time_* fields with smaller values. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a654b8a3586f..ee20d4c546b5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2217,6 +2217,33 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +/* + * Complement to update_event_times(). This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. 
+ * + * Thus given the rules of update_event_times(): + * + * total_time_enabled = tstamp_stopped - tstamp_enabled + * total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. + */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + + event->tstamp_stopped = now; + event->tstamp_enabled = now - event->total_time_enabled; + event->tstamp_running = now - event->total_time_running; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2224,9 +2251,12 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + /* + * We can be called with event->state == STATE_OFF when we create with + * .disabled = 1. In that case the IOC_ENABLE will call this function. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2471,10 +2501,11 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + __perf_event_enable_time(event, tstamp); list_for_each_entry(sub, &event->sibling_list, group_entry) { + /* XXX should not be > INACTIVE if event isn't */ if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + __perf_event_enable_time(sub, tstamp); } } -- cgit v1.2.3 From fdccc3fb7a42ea4e4cd77d2fb8fa3a45c66ec0bf Mon Sep 17 00:00:00 2001 From: "leilei.lin" Date: Wed, 9 Aug 2017 08:29:21 +0800 Subject: perf/core: Reduce context switch overhead Skip most of the PMU context switching overhead when ctx->nr_events is 0. 50% performance overhead was observed under an extreme testcase. Signed-off-by: leilei.lin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: eranian@gmail.com Cc: jolsa@redhat.com Cc: linxiulei@gmail.com Cc: yang_oliver@hotmail.com Link: http://lkml.kernel.org/r/20170809002921.69813-1-leilei.lin@alibaba-inc.com [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ee20d4c546b5..d704e23914bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3211,6 +3211,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, return; perf_ctx_lock(cpuctx, ctx); + /* + * We must check ctx->nr_events while holding ctx->lock, such + * that we serialize against perf_install_in_context(). 
+ */ + if (!ctx->nr_events) + goto unlock; + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -3224,6 +3231,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); + +unlock: perf_ctx_unlock(cpuctx, ctx); } -- cgit v1.2.3 From a030d7381d8b3adabde724e3077bb6cb32d1b3ee Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 May 2017 10:59:52 +0530 Subject: sched/fair: Call cpufreq update util handlers less frequently on UP For SMP systems, update_load_avg() calls the cpufreq update util handlers only for the top level cfs_rq (i.e. rq->cfs). But that is not the case for UP systems. update_load_avg() calls util handler for any cfs_rq for which it is called. This would result in way too many calls from the scheduler to the cpufreq governors when CONFIG_FAIR_GROUP_SCHED is enabled. Reduce the frequency of these calls by copying the behavior from the SMP case, i.e. Only call util handlers for the top level cfs_rq. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Fixes: 536bd00cdbb7 ("sched/fair: Fix !CONFIG_SMP kernel cpufreq governor breakage") Link: http://lkml.kernel.org/r/6abf69a2107525885b616a2c1ec03d9c0946171c.1495603536.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..139abf2ae2a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2790,6 +2790,29 @@ static inline void update_cfs_shares(struct sched_entity *se) } #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + #ifdef CONFIG_SMP /* * Approximate: @@ -3276,29 +3299,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) -{ - if (&this_rq()->cfs == cfs_rq) { - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. - * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). 
- * - * See cpu_util(). - */ - cpufreq_update_util(rq_of(cfs_rq), 0); - } -} - /* * Unsigned subtract and clamp on underflow. * @@ -3544,7 +3544,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) static inline void update_load_avg(struct sched_entity *se, int not_used1) { - cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); + cfs_rq_util_change(cfs_rq_of(se)); } static inline void -- cgit v1.2.3 From 5b713a3d949bd7f8c615a335a8c40297586bc1b1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 May 2017 10:59:53 +0530 Subject: sched/core: Reuse put_prev_task() Reuse put_prev_task() instead of copying its implementation. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/e2e50578223d05c5e90a9feb964fe1ec5d09a052.1495603536.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20fba81..835a234d35e8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5438,7 +5438,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) */ next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); - next->sched_class->put_prev_task(rq, next); + put_prev_task(rq, next); /* * Rules for changing task_struct::cpus_allowed are holding -- cgit v1.2.3 From c7132dd6f0e3b07bd4541cda9040897cc460d855 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 May 2017 10:59:54 +0530 Subject: sched/fair: Pass 'rq' to weighted_cpuload() weighted_cpuload() uses the cpu number passed to it get pointer to the runqueue. Almost all callers of weighted_cpuload() already have the rq pointer with them and can send that directly to weighted_cpuload(). In some cases the callers actually get the CPU number by doing cpu_of(rq). It would be simpler to pass rq to weighted_cpuload(). 
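The interface change, condensed from the diff below into a minimal sketch:

	/* before: look the runqueue up from the CPU number on every call */
	static unsigned long weighted_cpuload(const int cpu)
	{
		return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
	}

	/* after: callers pass the rq they already have (or cpu_rq(i) otherwise) */
	static unsigned long weighted_cpuload(struct rq *rq)
	{
		return cfs_rq_runnable_load_avg(&rq->cfs);
	}
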
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/b7720627e0576dc29b4ba3f9b6edbc913bb4f684.1495603536.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 139abf2ae2a5..27d425eccaad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1378,7 +1378,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); +static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); @@ -1409,7 +1409,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -5125,9 +5125,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, } /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(struct rq *rq) { - return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); + return cfs_rq_runnable_load_avg(&rq->cfs); } #ifdef CONFIG_NO_HZ_COMMON @@ -5172,7 +5172,7 @@ static void cpu_load_update_idle(struct rq *this_rq) /* * bail if there's load or we're actually up-to-date. */ - if (weighted_cpuload(cpu_of(this_rq))) + if (weighted_cpuload(this_rq)) return; cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -5193,7 +5193,7 @@ void cpu_load_update_nohz_start(void) * concurrently we'll exit nohz. And cpu_load write can race with * cpu_load_update_idle() but both updater would be writing the same. 
*/ - this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); + this_rq->cpu_load[0] = weighted_cpuload(this_rq); } /* @@ -5209,7 +5209,7 @@ void cpu_load_update_nohz_stop(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - load = weighted_cpuload(cpu_of(this_rq)); + load = weighted_cpuload(this_rq); rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -5235,7 +5235,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) */ void cpu_load_update_active(struct rq *this_rq) { - unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long load = weighted_cpuload(this_rq); if (tick_nohz_tick_stopped()) cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -5253,7 +5253,7 @@ void cpu_load_update_active(struct rq *this_rq) static unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(rq); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -5268,7 +5268,7 @@ static unsigned long source_load(int cpu, int type) static unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(rq); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -5290,7 +5290,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(cpu); + unsigned long load_avg = weighted_cpuload(rq); if (nr_running) return load_avg / nr_running; @@ -5550,7 +5550,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); + load = weighted_cpuload(cpu_rq(i)); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; @@ -7363,7 +7363,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(i); + sgs->sum_weighted_load += weighted_cpuload(rq); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7892,7 +7892,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(rq); /* * When comparing with imbalance, use weighted_cpuload() -- cgit v1.2.3 From 9674f5cad22a590c865a330ce333026b9f9c078b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 May 2017 10:59:55 +0530 Subject: sched/fair: Avoid checking cfs_rq->nr_running twice Rearrange pick_next_task_fair() a bit to avoid checking cfs_rq->nr_running twice for the case where FAIR_GROUP_SCHED is enabled and the previous task doesn't belong to the fair class. 
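Roughly, the control-flow change looks like the sketch below (userspace model, made-up names): the emptiness test is hoisted in front of the group-scheduling branch so neither the hierarchical path nor the former 'simple' path repeats it.

        #include <stdio.h>

        struct cfs_rq { unsigned int nr_running; };

        enum pick { PICK_IDLE, PICK_SIMPLE, PICK_HIERARCHICAL };

        static enum pick pick_next(const struct cfs_rq *cfs_rq, int prev_was_fair)
        {
                if (!cfs_rq->nr_running)        /* checked exactly once */
                        return PICK_IDLE;

                if (!prev_was_fair)             /* former 'simple' path, no re-check */
                        return PICK_SIMPLE;

                return PICK_HIERARCHICAL;
        }

        int main(void)
        {
                struct cfs_rq rq = { .nr_running = 0 };

                printf("%d\n", pick_next(&rq, 0));      /* prints 0: idle */
                return 0;
        }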
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/000903ab3df3350943d3271c53615893a230dc95.1495603536.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 27d425eccaad..30fd196c0cde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6187,10 +6187,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: -#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) goto idle; +#ifdef CONFIG_FAIR_GROUP_SCHED if (prev->sched_class != &fair_sched_class) goto simple; @@ -6220,11 +6220,17 @@ again: /* * This call to check_cfs_rq_runtime() will do the * throttle and dequeue its entity in the parent(s). - * Therefore the 'simple' nr_running test will indeed + * Therefore the nr_running test will indeed * be correct. */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) + if (unlikely(check_cfs_rq_runtime(cfs_rq))) { + cfs_rq = &rq->cfs; + + if (!cfs_rq->nr_running) + goto idle; + goto simple; + } } se = pick_next_entity(cfs_rq, curr); @@ -6264,12 +6270,8 @@ again: return p; simple: - cfs_rq = &rq->cfs; #endif - if (!cfs_rq->nr_running) - goto idle; - put_prev_task(rq, prev); do { -- cgit v1.2.3 From 3a123bbbb10d54dbdde6ccbbd519c74c91ba2f52 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 May 2017 10:59:56 +0530 Subject: sched/fair: Drop always true parameter of update_cfs_rq_load_avg() update_freq is always true and there is no need to pass it to update_cfs_rq_load_avg(). Remove it. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/2d28d295f3f591ede7e931462bce1bda5aaa4896.1495603536.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 30fd196c0cde..75c58c77450a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se) /* * For !fair tasks do: * - update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq); attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * @@ -3320,7 +3320,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_task() * @cfs_rq: cfs_rq to update - * @update_freq: should we call cfs_rq_util_change() or will the call do so * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) * avg. The immediate corollary is that all (fair) tasks must be attached, see @@ -3334,7 +3333,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} * call update_tg_load_avg() when this function returns true. 
*/ static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { struct sched_avg *sa = &cfs_rq->avg; int decayed, removed_load = 0, removed_util = 0; @@ -3362,7 +3361,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (update_freq && (decayed || removed_util)) + if (decayed || removed_util) cfs_rq_util_change(cfs_rq); return decayed || removed_load; @@ -3390,7 +3389,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cpu, cfs_rq, se); - decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -3534,7 +3533,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { return 0; } @@ -6919,7 +6918,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) update_tg_load_avg(cfs_rq, 0); /* Propagate pending load changes to the parent, if any: */ @@ -6992,7 +6991,7 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); rq_unlock_irqrestore(rq, &rf); } -- cgit v1.2.3 From 4d13a06d54c415238325b0fe2c14f1052da4512f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 13 Apr 2017 14:45:48 +0530 Subject: sched/topology: Drop memset() from init_rootdomain() There are only two callers of init_rootdomain(). One of them passes a global to it and another one sends dynamically allocated root-domain. There is no need to memset the root-domain in the first case as the structure is already reset. Update alloc_rootdomain() to allocate the memory with kzalloc() and remove the memset() call from init_rootdomain(). 
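The same idea in a short userspace sketch, with calloc() standing in for kzalloc() and a toy struct: zero the memory at allocation time so the shared init helper needs no memset() of its own.

        #include <stdlib.h>

        struct root_domain { int refcount; unsigned long *span; };

        static int init_rootdomain(struct root_domain *rd)
        {
                /* no memset() here: both callers pass in already-zeroed memory */
                rd->span = calloc(1, sizeof(*rd->span));
                return rd->span ? 0 : -1;
        }

        static struct root_domain *alloc_rootdomain(void)
        {
                /* zeroing allocation replaces the old malloc()+memset() pair */
                struct root_domain *rd = calloc(1, sizeof(*rd));

                if (rd && init_rootdomain(rd) < 0) {
                        free(rd);
                        rd = NULL;
                }
                return rd;
        }

        int main(void)
        {
                struct root_domain *rd = alloc_rootdomain();

                if (rd) {
                        free(rd->span);
                        free(rd);
                }
                return 0;
        }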
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/fc2f6cc90b098040970c85a97046512572d765bc.1492065513.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 79895aec281e..216fee014b32 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) static int init_rootdomain(struct root_domain *rd) { - memset(rd, 0, sizeof(*rd)); - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) @@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void) { struct root_domain *rd; - rd = kmalloc(sizeof(*rd), GFP_KERNEL); + rd = kzalloc(sizeof(*rd), GFP_KERNEL); if (!rd) return NULL; -- cgit v1.2.3 From 42d394d41ab90f4ed9d7a7403ed22e8f7590948a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 13 Apr 2017 14:45:49 +0530 Subject: sched/deadline: Don't re-initialize 'struct cpudl' The 'struct cpudl' passed to cpudl_init() is already initialized to zero. Don't do that again. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/bd4c229806bc96694b15546207afcc221387d2f5.1492065513.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index fba235c7d026..bdf448b6556f 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp) { int i; - memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; -- cgit v1.2.3 From 1c2a4861dbfca373fea1ff2cf9e9793933d024ce Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 13 Apr 2017 14:45:50 +0530 Subject: sched/cpupri: Don't re-initialize 'struct cpupri' The 'struct cpupri' passed to cpupri_init() is already initialized to zero. Don't do that again. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/8a71d48c5a077500b6ddc1a41484c0ac8d3aad94.1492065513.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..2511aba36b89 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp) { int i; - memset(cp, 0, sizeof(*cp)); - for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; -- cgit v1.2.3 From 181a80d1f7f453f58c4b47f89084d0849632858c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 27 Apr 2017 13:58:59 +0530 Subject: sched: Mark pick_next_task_dl() and build_sched_domain() as static pick_next_task_dl() and build_sched_domain() aren't used outside deadline.c and topology.c. Make them static. 
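A small illustration of the point (not taken from the patch): a helper called only from one translation unit should be static, which keeps it out of the global symbol table and lets the compiler flag it if it ever becomes unused.

        /* file-local: only callers in this .c file, no external linkage */
        static int earliest(int a, int b)
        {
                return a < b ? a : b;
        }

        /* the only symbol this file needs to export */
        int module_entry(int a, int b)
        {
                return earliest(a, b);
        }

        int main(void)
        {
                return module_entry(2, 3) == 2 ? 0 : 1;
        }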
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/36e4cbb6210002cadae89920ae97e19e7e513008.1493281605.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- kernel/sched/topology.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..a205ac7cf435 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct * +static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 216fee014b32..bd8b6d6f5387 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1593,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map) } } -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, +static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { -- cgit v1.2.3 From f235a54f00449c611f85173fe8a66c4d189c5ce1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Sat, 1 Jul 2017 07:06:13 +0200 Subject: sched/pelt: Fix false running accounting The running state is a subset of runnable state which means that running can't be set if runnable (weight) is cleared. There are corner cases where the current sched_entity has been already dequeued but cfs_rq->curr has not been updated yet and still points to the dequeued sched_entity. If ___update_load_avg() is called at that time, weight will be 0 and running will be set which is not possible. This case happens during pick_next_task_fair() when a cfs_rq becomes idles. The current sched_entity has been dequeued so se->on_rq is cleared and cfs_rq->weight is null. But cfs_rq->curr still points to se (it will be cleared when picking the idle thread). Because the cfs_rq becomes idle, idle_balance() is called and ends up to call update_blocked_averages() with these wrong running and runnable states. Add a test in ___update_load_avg() to correct the running state in this case. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Link: http://lkml.kernel.org/r/1498885573-18984-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 75c58c77450a..ef5b66b110f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2990,6 +2990,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, sa->last_update_time += delta << 10; + /* + * running is a subset of runnable (weight) so running can't be set if + * runnable is clear. But there are some corner cases where the current + * se has been already dequeued but cfs_rq->curr still points to it. + * This means that weight will be 0 but not running for a sched_entity + * but also for a cfs_rq if the latter becomes idle. 
As an example, + * this happens during idle_balance() which calls + * update_blocked_averages() + */ + if (!weight) + running = 0; + /* * Now we know we crossed measurement unit boundaries. The *_avg * accrues by two steps: -- cgit v1.2.3 From 37ec97deb3a8c68a7adfab61beb261ffeab19d09 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 31 Jul 2017 15:28:46 -0400 Subject: sched/numa: Slow down scan rate if shared faults dominate The comment above update_task_scan_period() says the scan period should be increased (scanning slows down) if the majority of memory accesses are on the local node, or if the majority of the page accesses are shared with other tasks. However, with the current code, all a high ratio of shared accesses does is slow down the rate at which scanning is made faster. This patch changes things so either lots of shared accesses or lots of local accesses will slow down scanning, and numa scanning is sped up only when there are lots of private faults on remote memory pages. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: lvenanci@redhat.com Link: http://lkml.kernel.org/r/20170731192847.23050-2-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef5b66b110f8..cb6b7c83b74d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1892,7 +1892,7 @@ static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private) { unsigned int period_slot; - int ratio; + int lr_ratio, ps_ratio; int diff; unsigned long remote = p->numa_faults_locality[0]; @@ -1922,25 +1922,36 @@ static void update_task_scan_period(struct task_struct *p, * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) */ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); - ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); - if (ratio >= NUMA_PERIOD_THRESHOLD) { - int slot = ratio - NUMA_PERIOD_THRESHOLD; + lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); + + if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are local. There is no need to + * do fast NUMA scanning, since memory is already local. + */ + int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are shared with other tasks. + * There is no point in continuing fast NUMA scanning, + * since other tasks may just move the memory elsewhere. + */ + int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; if (!slot) slot = 1; diff = slot * period_slot; } else { - diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; - /* - * Scale scan rate increases based on sharing. There is an - * inverse relationship between the degree of sharing and - * the adjustment made to the scanning period. Broadly - * speaking the intent is that there is little point - * scanning faster if shared accesses dominate as it may - * simply bounce migrations uselessly + * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, + * yet they are not on the local NUMA node. Speed up + * NUMA scanning to get the memory moved over. 
*/ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); - diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + int ratio = max(lr_ratio, ps_ratio); + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; } p->numa_scan_period = clamp(p->numa_scan_period + diff, -- cgit v1.2.3 From b5dd77c8bdada7b6262d0cba02a6ed525bf4e6e1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 31 Jul 2017 15:28:47 -0400 Subject: sched/numa: Scale scan period with tasks in group and shared/private Running 80 tasks in the same group, or as threads of the same process, results in the memory getting scanned 80x as fast as it would be if a single task was using the memory. This really hurts some workloads. Scale the scan period by the number of tasks in the numa group, and the shared / private ratio, so the average rate at which memory in the group is scanned corresponds roughly to the rate at which a single task would scan its memory. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: lvenanci@redhat.com Link: http://lkml.kernel.org/r/20170731192847.23050-3-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 111 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 86 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cb6b7c83b74d..a7f1c3b797f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + int active_nodes; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long max_faults_cpu; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +static inline unsigned long group_faults_priv(struct numa_group *ng); +static inline unsigned long group_faults_shared(struct numa_group *ng); + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p) return max_t(unsigned int, floor, scan); } +static unsigned int task_scan_start(struct task_struct *p) +{ + unsigned long smin = task_scan_min(p); + unsigned long period = smin; + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + } + + return max(smin, period); +} + static unsigned int task_scan_max(struct task_struct *p) { - unsigned int smin = task_scan_min(p); - unsigned int smax; + unsigned long smin = task_scan_min(p); + unsigned long smax; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + + /* Scale the maximum scan period with the amount of shared memory. 
*/ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + unsigned long period = smax; + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + + smax = max(smax, period); + } + return max(smin, smax); } @@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - int active_nodes; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long max_faults_cpu; - /* - * Faults_cpu is used to decide whether memory should move - * towards the CPU. As a consequence, these stats are weighted - * more by CPU use than by memory faults. - */ - unsigned long *faults_cpu; - unsigned long faults[0]; -}; - /* Shared or private faults. */ #define NR_NUMA_HINT_FAULT_TYPES 2 @@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +static inline unsigned long group_faults_priv(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + + return faults; +} + +static inline unsigned long group_faults_shared(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; + } + + return faults; +} + /* * A node triggering more than 1/3 as many NUMA faults as the maximum is * considered part of a numa group's pseudo-interleaving set. Migrations @@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p) * Reset the scan period if the task is being rescheduled on an * alternative node to recheck if the tasks is now properly placed. */ - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); @@ -2459,7 +2520,7 @@ void task_numa_work(struct callback_head *work) if (p->numa_scan_period == 0) { p->numa_scan_period_max = task_scan_max(p); - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); } next_scan = now + msecs_to_jiffies(p->numa_scan_period); @@ -2587,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now > curr->node_stamp + period) { if (!curr->node_stamp) - curr->numa_scan_period = task_scan_min(curr); + curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { -- cgit v1.2.3 From b18c3ca11c20caa4a397baa9b893ebc4aaa4fe9f Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 23 May 2017 11:00:56 +0900 Subject: sched/deadline: Make find_later_rq() choose a closer CPU in topology When cpudl_find() returns any among free_cpus, the CPU might not be closer than others, considering sched domain. For example: this_cpu: 15 free_cpus: 0, 1,..., 14 (== later_mask) best_cpu: 0 topology: 0 --+ +--+ 1 --+ | +-- ... --+ 2 --+ | | +--+ | 3 --+ | ... ... 12 --+ | +--+ | 13 --+ | | +-- ... -+ 14 --+ | +--+ 15 --+ In this case, it would be best to select 14 since it's a free CPU and closest to 15 (this_cpu). However, currently the code selects 0 (best_cpu) even though that's just any among free_cpus. Fix it. 
This (re)aligns the deadline behaviour with the rt behaviour. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1495504859-10960-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a205ac7cf435..ac07d7c3a978 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task) struct sched_domain *sd; struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); - int best_cpu, cpu = task_cpu(task); + int cpu = task_cpu(task); /* Make sure the mask is initialized first */ if (unlikely(!later_mask)) @@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. */ - best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, - task, later_mask); - if (best_cpu == -1) + if (cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask) == -1) return -1; /* - * If we are here, some target has been found, - * the most suitable of which is cached in best_cpu. - * This is, among the runqueues where the current tasks - * have later deadlines than the task's one, the rq - * with the latest possible one. + * If we are here, some targets have been found, including + * the most suitable which is, among the runqueues where the + * current tasks have later deadlines than the task's one, the + * rq with the latest possible one. * * Now we check how well this matches with task's * affinity and system topology. @@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task) rcu_read_lock(); for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; /* * If possible, preempting this_cpu is @@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } + best_cpu = cpumask_first_and(later_mask, + sched_domain_span(sd)); /* - * Last chance: if best_cpu is valid and is - * in the mask, that becomes our choice. + * Last chance: if a cpu being in both later_mask + * and current sd span is valid, that becomes our + * choice. Of course, the latest possible cpu is + * already under consideration through later_mask. */ - if (best_cpu < nr_cpu_ids && - cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; } -- cgit v1.2.3 From 3261ed0b25098f92d36d5ad14524254d8c7fba54 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 23 May 2017 11:00:57 +0900 Subject: sched/deadline: Change return value of cpudl_find() cpudl_find() users are only interested in knowing if suitable CPU(s) were found or not (and then they look at later_mask to know which). Change cpudl_find() return type accordingly. Aligns with rt code. 
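A minimal userspace model of the new calling convention (toy mask type and made-up values, not the kernel API): report success or failure, and hand the candidate set back through the caller-supplied mask rather than returning one 'best' CPU.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        typedef uint64_t cpumask_t;             /* toy mask, one bit per CPU */

        /* after: boolean result, candidates reported via *later_mask */
        static bool cpudl_find(cpumask_t free_cpus, cpumask_t allowed,
                               cpumask_t *later_mask)
        {
                cpumask_t hit = free_cpus & allowed;

                if (hit) {
                        if (later_mask)
                                *later_mask = hit;
                        return true;
                }
                return false;
        }

        int main(void)
        {
                cpumask_t later = 0;

                if (cpudl_find(0x0fULL, 0x06ULL, &later))
                        printf("candidates: %#llx\n", (unsigned long long)later);
                return 0;
        }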
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1495504859-10960-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 26 +++++++++++++------------- kernel/sched/deadline.c | 6 +++--- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bdf448b6556f..8d9562d890d3 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp) * @p: the task * @later_mask: a mask to fill in with the selected CPUs (or NULL) * - * Returns: int - best CPU (heap maximum if suitable) + * Returns: int - CPUs were found */ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask) { - int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { - best_cpu = cpumask_any(later_mask); - goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && - dl_time_before(dl_se->deadline, cp->elements[0].dl)) { - best_cpu = cpudl_maximum(cp); - if (later_mask) - cpumask_set_cpu(best_cpu, later_mask); - } + return 1; + } else { + int best_cpu = cpudl_maximum(cp); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); -out: - WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); - return best_cpu; + return 1; + } + } + return 0; } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ac07d7c3a978..d05bd9457a40 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) return; /* @@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. */ if (p->nr_cpus_allowed != 1 && - cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + cpudl_find(&rq->rd->cpudl, p, NULL)) return; resched_curr(rq); @@ -1811,7 +1811,7 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. */ - if (cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask) == -1) + if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) return -1; /* -- cgit v1.2.3 From 18f08dae19990f5fffde92e3a63e0d90cda0f1a8 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 4 Aug 2017 17:19:37 +0800 Subject: sched/core: Remove unnecessary initialization init_idle_bootup_task() init_idle_bootup_task( ) is called in rest_init( ) to switch the scheduling class of the boot thread to the idle class. 
the function only sets: idle->sched_class = &idle_sched_class; which has been set in init_idle() called by sched_init(): /* * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; We've already set the boot thread to idle class in start_kernel()->sched_init()->init_idle() so it's unnecessary to set it again in start_kernel()->rest_init()->init_idle_bootup_task() Signed-off-by: Cheng Jian Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1501838377-109720-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 835a234d35e8..6d91c10b1814 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5177,11 +5177,6 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - /** * init_idle - set up an idle thread for a given CPU * @idle: task in question -- cgit v1.2.3 From 74dc3384fc7983b78cc46ebb1824968a3db85eb1 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 6 Aug 2017 14:41:41 +1000 Subject: sched/debug: Use task_pid_nr_ns in /proc/$pid/sched It appears as though the addition of the PID namespace did not update the output code for /proc/*/sched, which resulted in it providing PIDs that were not self-consistent with the /proc mount. This additionally made it trivial to detect whether a process was inside &init_pid_ns from userspace, making container detection trivial: https://github.com/jessfraz/amicontained This leads to situations such as: % unshare -pmf % mount -t proc proc /proc % head -n1 /proc/1/sched head (10047, #threads: 1) Fix this by just using task_pid_nr_ns for the output of /proc/*/sched. All of the other uses of task_pid_nr in kernel/sched/debug.c are from a sysctl context and thus don't need to be namespaced. Signed-off-by: Aleksa Sarai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Eric W. Biederman Cc: Jess Frazelle Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cyphar@cyphar.com Link: http://lkml.kernel.org/r/20170806044141.5093-1-asarai@suse.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fa66de52bd6..ac345115877b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -872,11 +872,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) #endif } -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" -- cgit v1.2.3 From e8c164954b926f06f109a42fb8595ed01275b141 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 7 Aug 2017 16:44:22 +0800 Subject: sched/debug: Show task state in /proc/sched_debug Currently we print the runnable task in /proc/sched_debug, but there is no task state information. We don't know which task is in the runqueue and which task is sleeping. 
Add task state in the runnable task list, like this: runnable tasks: S task PID tree-key switches prio wait-time sum-exec sum-sleep ----------------------------------------------------------------------------------------------------------- S watchdog/239 1452 -11.917445 2811 0 0.000000 8.949306 0.000000 7 0 / S migration/239 1453 20686.367740 8 0 0.000000 16215.720897 0.000000 7 0 / S ksoftirqd/239 1454 115383.841071 12 120 0.000000 0.200683 0.000000 7 0 / >R test 21287 4872.190970 407 120 0.000000 4874.911790 0.000000 7 0 /autogroup-150 R test 21288 4868.385454 401 120 0.000000 3672.341489 0.000000 7 0 /autogroup-150 R test 21289 4868.326776 384 120 0.000000 3424.934159 0.000000 7 0 /autogroup-150 Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1502095463-160172-2-git-send-email-xiexiuqi@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ac345115877b..d8d2ea215b85 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -421,13 +421,19 @@ static char *task_group_path(struct task_group *tg) } #endif +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - if (rq->curr == p) - SEQ_printf(m, "R"); - else - SEQ_printf(m, " "); + unsigned long state; + + if (rq->curr == p) { + SEQ_printf(m, ">R"); + } else { + state = p->state ? __ffs(p->state) + 1 : 0; + SEQ_printf(m, " %c", state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); + } SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), @@ -456,9 +462,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" + " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n" - "------------------------------------------------------" + "-------------------------------------------------------" "----------------------------------------------------\n"); rcu_read_lock(); -- cgit v1.2.3 From 20435d84e5f2041c64c792399ab6f2948a2c2252 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 7 Aug 2017 16:44:23 +0800 Subject: sched/debug: Intruduce task_state_to_char() helper function Now that we have more than one place to get the task state, intruduce the task_state_to_char() helper function to save some code. No functionality changed. 
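A simplified userspace sketch of such a helper (the letter table below is a shortened, illustrative stand-in for the real TASK_STATE_TO_CHAR_STR): both /proc/<pid>/sched and sched_show_task() can then map a state word to one letter the same way.

        #include <stdio.h>

        /* toy table: index 0 = running (no state bits set) */
        static const char state_chars[] = "RSDTtXZ";

        static char task_state_to_char(unsigned long state)
        {
                unsigned int idx = 0;

                if (state) {
                        while (!(state & 1UL)) {        /* position of lowest set bit... */
                                state >>= 1;
                                idx++;
                        }
                        idx++;                          /* ...plus one, as in __ffs() + 1 */
                }
                return idx < sizeof(state_chars) - 1 ? state_chars[idx] : '?';
        }

        int main(void)
        {
                printf("%c %c %c\n",
                       task_state_to_char(0),           /* R: running          */
                       task_state_to_char(1UL),         /* S: sleeping         */
                       task_state_to_char(1UL << 1));   /* D: uninterruptible  */
                return 0;
        }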
Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1502095463-160172-3-git-send-email-xiexiuqi@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 15 ++++----------- kernel/sched/debug.c | 10 +++------- 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d91c10b1814..f9f9948e2470 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5103,24 +5103,17 @@ out_unlock: return retval; } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned long state = p->state; - - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); if (!try_get_task_stack(p)) return; - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - if (state == TASK_RUNNING) + + printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + + if (p->state == TASK_RUNNING) printk(KERN_CONT " running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d8d2ea215b85..cfd84f79e075 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -426,14 +426,10 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - unsigned long state; - - if (rq->curr == p) { + if (rq->curr == p) SEQ_printf(m, ">R"); - } else { - state = p->state ? __ffs(p->state) + 1 : 0; - SEQ_printf(m, " %c", state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - } + else + SEQ_printf(m, " %c", task_state_to_char(p)); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), -- cgit v1.2.3 From 35a2897c2a306cca344ca5c0b43416707018f434 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 15 Jun 2017 12:18:28 +0800 Subject: sched/wait: Remove the lockless swait_active() check in swake_up*() Steven Rostedt reported a potential race in RCU core because of swake_up(): CPU0 CPU1 ---- ---- __call_rcu_core() { spin_lock(rnp_root) need_wake = __rcu_start_gp() { rcu_start_gp_advanced() { gp_flags = FLAG_INIT } } rcu_gp_kthread() { swait_event_interruptible(wq, gp_flags & FLAG_INIT) { spin_lock(q->lock) *fetch wq->task_list here! * list_add(wq->task_list, q->task_list) spin_unlock(q->lock); *fetch old value of gp_flags here * spin_unlock(rnp_root) rcu_gp_kthread_wake() { swake_up(wq) { swait_active(wq) { list_empty(wq->task_list) } * return false * if (condition) * false * schedule(); In this case, a wakeup is missed, which could cause the rcu_gp_kthread waits for a long time. The reason of this is that we do a lockless swait_active() check in swake_up(). To fix this, we can either 1) add a smp_mb() in swake_up() before swait_active() to provide the proper order or 2) simply remove the swait_active() in swake_up(). The solution 2 not only fixes this problem but also keeps the swait and wait API as close as possible, as wake_up() doesn't provide a full barrier and doesn't do a lockless check of the wait queue either. Moreover, there are users already using swait_active() to do their quick checks for the wait queues, so it make less sense that swake_up() and swake_up_all() do this on their own. 
This patch then removes the lockless swait_active() check in swake_up() and swake_up_all(). Reported-by: Steven Rostedt Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Cc: Krister Johansen Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170615041828.zk3a3sfyudm5p6nl@tardis Signed-off-by: Ingo Molnar --- kernel/sched/swait.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 3d5610dcce11..2227e183e202 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q) { unsigned long flags; - if (!swait_active(q)) - return; - raw_spin_lock_irqsave(&q->lock, flags); swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); @@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); - if (!swait_active(q)) - return; - raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { -- cgit v1.2.3 From 50972fe78f24f1cd0b9d7bbf1f87d2be9e4f412e Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Fri, 14 Jul 2017 19:17:56 +0530 Subject: locking/osq_lock: Fix osq_lock queue corruption Fix ordering of link creation between node->prev and prev->next in osq_lock(). A case in which the status of optimistic spin queue is CPU6->CPU2 in which CPU6 has acquired the lock. tail v ,-. <- ,-. |6| |2| `-' -> `-' At this point if CPU0 comes in to acquire osq_lock, it will update the tail count. CPU2 CPU0 ---------------------------------- tail v ,-. <- ,-. ,-. |6| |2| |0| `-' -> `-' `-' After tail count update if CPU2 starts to unqueue itself from optimistic spin queue, it will find an updated tail count with CPU0 and update CPU2 node->next to NULL in osq_wait_next(). unqueue-A tail v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' `-' unqueue-B ->tail != curr && !node->next If reordering of following stores happen then prev->next where prev being CPU2 would be updated to point to CPU0 node: tail v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' -> `-' osq_wait_next() node->next <- 0 xchg(node->next, NULL) tail v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' `-' unqueue-C At this point if next instruction WRITE_ONCE(next->prev, prev); in CPU2 path is committed before the update of CPU0 node->prev = prev then CPU0 node->prev will point to CPU6 node. tail v----------. v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' `-' `----------^ At this point if CPU0 path's node->prev = prev is committed resulting in change of CPU0 prev back to CPU2 node. CPU2 node->next is NULL currently, tail v ,-. <- ,-. <- ,-. |6| |2| |0| `-' `-' `-' `----------^ so if CPU0 gets into unqueue path of osq_lock it will keep spinning in infinite loop as condition prev->next == node will never be true. Signed-off-by: Prateek Sood [ Added pictures, rewrote comments. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1500040076-27626-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/locking/osq_lock.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a3167941093b..a74ee6abd039 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + WRITE_ONCE(prev->next, node); /* -- cgit v1.2.3 From 0aa1125fa8bc5e5f98317156728fa4d0293561a5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Jun 2017 21:02:12 +0300 Subject: locking/rwsem-spinlock: Add killable versions of __down_read() Rename __down_read() in __down_read_common() and teach it to abort waiting in case of pending signals and killable state argument passed. Note, that we shouldn't wake anybody up in EINTR path, as: We check for signal_pending_state() after (!waiter.task) test and under spinlock. So, current task wasn't able to be woken up. It may be in two cases: a writer is owner of the sem, or a writer is a first waiter of the sem. If a writer is owner of the sem, no one else may work with it in parallel. It will wake somebody, when it call up_write() or downgrade_write(). If a writer is the first waiter, it will be woken up, when the last active reader releases the sem, and sem->count became 0. Also note, that set_current_state() may be moved down to schedule() (after !waiter.task check), as all assignments in this type of semaphore (including wake_up), occur under spinlock, so we can't miss anything. 
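The wrapper pattern, reduced to a userspace sketch with stand-in wait primitives: one _common() routine takes the sleep state and checks for a pending fatal signal, and the uninterruptible and killable entry points become thin wrappers around it.

        #include <errno.h>
        #include <stdbool.h>

        enum sleep_state { STATE_UNINTERRUPTIBLE, STATE_KILLABLE };

        /* stand-ins for the kernel's wait loop and fatal-signal check */
        static bool fatal_signal_pending(void) { return false; }
        static bool lock_granted(void)         { return true;  }

        static int down_read_common(enum sleep_state state)
        {
                while (!lock_granted()) {
                        if (state == STATE_KILLABLE && fatal_signal_pending())
                                return -EINTR;  /* give up the wait instead of sleeping on */
                        /* block and retry ... */
                }
                return 0;
        }

        /* old entry point keeps its behaviour; the new one only changes the state */
        static void down_read(void)          { down_read_common(STATE_UNINTERRUPTIBLE); }
        static int  down_read_killable(void) { return down_read_common(STATE_KILLABLE); }

        int main(void)
        {
                down_read();
                return down_read_killable();
        }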
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/149789533283.9059.9829416940494747182.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 20819df98125..0848634c5512 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +int __sched __down_read_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; unsigned long flags; @@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - set_current_state(TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; @@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - /* wait to be given the lock */ for (;;) { if (!waiter.task) break; + if (signal_pending_state(state, current)) + goto out_nolock; + set_current_state(state); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + raw_spin_lock_irqsave(&sem->wait_lock, flags); } - __set_current_state(TASK_RUNNING); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); out: - ; + return 0; + +out_nolock: + /* + * We didn't take the lock, so that there is a writer, which + * is owner or the first waiter of the sem. If it's a waiter, + * it will be woken by current owner. Not need to wake anybody. + */ + list_del(&waiter.list); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + return -EINTR; +} + +void __sched __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_read_killable(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_KILLABLE); } /* -- cgit v1.2.3 From 83ced169d9a01f22eb39f1fcc1f89ad9d223238f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Jun 2017 21:02:26 +0300 Subject: locking/rwsem-xadd: Add killable versions of rwsem_down_read_failed() Rename rwsem_down_read_failed() in __rwsem_down_read_failed_common() and teach it to abort waiting in case of pending signals and killable state argument passed. Note, that we shouldn't wake anybody up in EINTR path, as: We check for (waiter.task) under spinlock before we go to out_nolock path. Current task wasn't able to be woken up, so there are a writer, owning the sem, or a writer, which is the first waiter. In the both cases we shouldn't wake anybody. If there is a writer, owning the sem, and we were the only waiter, remove RWSEM_WAITING_BIAS, as there are no waiters anymore. 
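One bookkeeping detail, sketched with a toy count encoding rather than the real RWSEM_* constants: the wait list contributes a single 'waiters present' bias, so an aborting reader that leaves the list empty must take that bias back out of the count.

        #include <stdatomic.h>
        #include <stdbool.h>

        #define WAITERS_BIAS 0x10000L           /* toy stand-in for the real bias */

        static atomic_long count;

        static void reader_abort_wait(bool wait_list_now_empty)
        {
                /* the first waiter added WAITERS_BIAS once; if nobody is queued
                 * any more, remove it so writers stop seeing phantom waiters */
                if (wait_list_now_empty)
                        atomic_fetch_sub(&count, WAITERS_BIAS);
        }

        int main(void)
        {
                atomic_store(&count, WAITERS_BIAS);     /* a waiter queued earlier... */
                reader_abort_wait(true);                /* ...and now aborts as the last one */
                return atomic_load(&count) == 0 ? 0 : 1;
        }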
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/149789534632.9059.2901382369609922565.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 34e727f18e49..02f660666ab8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Wait for the read lock to be granted */ -__visible -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; @@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); if (!waiter.task) break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } schedule(); } __set_current_state(TASK_RUNNING); return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_read_failed); +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + /* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the -- cgit v1.2.3 From 1dbb6704de91b169a58d0c8221624afd6a95cfc7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Aug 2017 17:24:04 +0200 Subject: jump_label: Fix concurrent static_key_enable/disable() static_key_enable/disable are trying to cap the static key count to 0/1. However, their use of key->enabled is outside jump_label_lock so they do not really ensure that. Rewrite them to do a quick check for an already enabled (respectively, already disabled), and then recheck under the jump label lock. Unlike static_key_slow_inc/dec, a failed check under the jump label lock does not modify key->enabled. 
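The enable path follows the usual quick-check-then-recheck shape. A compressed userspace model with C11 atomics and a mutex (it deliberately skips the hotplug lock and the transient -1 value the kernel uses):

        #include <pthread.h>
        #include <stdatomic.h>

        static atomic_int key_enabled;          /* 0 = off, 1 = on */
        static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER;

        static void patch_code(void) { }        /* stand-in for jump_label_update() */

        static void static_key_enable(void)
        {
                /* quick check outside the lock: already on, nothing to do */
                if (atomic_load(&key_enabled) > 0)
                        return;

                pthread_mutex_lock(&key_lock);
                /* recheck under the lock so only one caller flips the key */
                if (atomic_load(&key_enabled) == 0) {
                        patch_code();
                        atomic_store(&key_enabled, 1);
                }
                pthread_mutex_unlock(&key_lock);
        }

        int main(void)
        {
                static_key_enable();
                return atomic_load(&key_enabled) == 1 ? 0 : 1;
        }

Concurrent enablers either observe 1 outside the lock or serialize on the mutex, so the 0 -> 1 transition happens at most once.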
Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Cc: Eric Dumazet Cc: Jason Baron Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1501601046-35683-2-git-send-email-pbonzini@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 59 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d11c506a6ac3..833eecae825e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,28 +79,6 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_enable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (!count) - static_key_slow_inc(key); -} -EXPORT_SYMBOL_GPL(static_key_enable); - -void static_key_disable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (count) - static_key_slow_dec(key); -} -EXPORT_SYMBOL_GPL(static_key_disable); - void static_key_slow_inc(struct static_key *key) { int v, v1; @@ -139,6 +117,43 @@ void static_key_slow_inc(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_inc); +void static_key_enable(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) > 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + + cpus_read_lock(); + jump_label_lock(); + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); + jump_label_update(key); + atomic_set(&key->enabled, 1); + } + jump_label_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + + cpus_read_lock(); + jump_label_lock(); + if (atomic_cmpxchg(&key->enabled, 1, 0)) + jump_label_update(key); + jump_label_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable); + static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { -- cgit v1.2.3 From be040bea9085a9c2b1700c9e60888777baeb96d5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Aug 2017 17:24:06 +0200 Subject: cpuset: Make nr_cpusets private Any use of key->enabled (that is static_key_enabled and static_key_count) outside jump_label_lock should handle its own serialization. In the case of cpusets_enabled_key, the key is always incremented/decremented under cpuset_mutex, and hence the same rule applies to nr_cpusets. The rule *is* respected currently, but the mutex is static so nr_cpusets should be static too. Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Acked-by: Zefan Li Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1501601046-35683-4-git-send-email-pbonzini@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8d5151688504..9ed6a051a1b9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* Must be called with cpuset_mutex held. 
*/ +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /* * generate_sched_domains() * -- cgit v1.2.3 From d0646a6f5533226ceb7620c20717286d3a372794 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Aug 2017 23:58:50 +0200 Subject: jump_label: Add RELEASE barrier after text changes In the unlikely case text modification does not fully order things, add some extra ordering of our own to ensure we only enabled the fast path after all text is visible. Signed-off-by: Peter Zijlstra (Intel) Cc: Jason Baron Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 833eecae825e..f2ea678c269f 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -108,7 +108,11 @@ void static_key_slow_inc(struct static_key *key) if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); - atomic_set(&key->enabled, 1); + /* + * Ensure that if the above cmpxchg loop observes our positive + * value, it must also observe all the text changes. + */ + atomic_set_release(&key->enabled, 1); } else { atomic_inc(&key->enabled); } @@ -130,7 +134,10 @@ void static_key_enable(struct static_key *key) if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); - atomic_set(&key->enabled, 1); + /* + * See static_key_slow_inc(). + */ + atomic_set_release(&key->enabled, 1); } jump_label_unlock(); cpus_read_unlock(); -- cgit v1.2.3 From b70cecf4b6b72a9977576ab32cca0e24f286f517 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Aug 2017 09:02:54 +0100 Subject: jump_label: Move CPU hotplug locking As we're about to rework the locking, let's move the taking and release of the CPU hotplug lock to locations that will make its reworking completely obvious. Signed-off-by: Marc Zyngier Signed-off-by: Peter Zijlstra (Intel) Cc: Leo Yan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170801080257.5056-2-marc.zyngier@arm.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index f2ea678c269f..161301fff97d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -83,6 +83,7 @@ void static_key_slow_inc(struct static_key *key) { int v, v1; + cpus_read_lock(); STATIC_KEY_CHECK_USE(); /* @@ -99,11 +100,12 @@ void static_key_slow_inc(struct static_key *key) */ for (v = atomic_read(&key->enabled); v > 0; v = v1) { v1 = atomic_cmpxchg(&key->enabled, v, v + 1); - if (likely(v1 == v)) + if (likely(v1 == v)) { + cpus_read_unlock(); return; + } } - cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); -- cgit v1.2.3 From 8b7b412807053ab5f059ffae426a280e769a5bda Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Aug 2017 09:02:55 +0100 Subject: jump_label: Split out code under the hotplug lock In order to later introduce an "already locked" version of some of the static key funcions, let's split the code into the core stuff (the *_cpuslocked functions) and the usual helpers, which now take/release the hotplug lock and call into the _cpuslocked versions. 
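The resulting shape, sketched with a hypothetical helper (not code from this patch): the *_cpuslocked variant does the work and assumes the CPU hotplug lock is already held, while the existing entry point becomes a thin lock/unlock wrapper around it:

        #include <linux/cpu.h>

        /* assumes cpus_read_lock() is held by the caller */
        static void example_update_cpuslocked(void)
        {
                /* ... the actual work, e.g. patching text ... */
        }

        static void example_update(void)
        {
                cpus_read_lock();
                example_update_cpuslocked();
                cpus_read_unlock();
        }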
Signed-off-by: Marc Zyngier Signed-off-by: Peter Zijlstra (Intel) Cc: Leo Yan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170801080257.5056-3-marc.zyngier@arm.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 161301fff97d..cc6d815c75ed 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,11 +79,10 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_slow_inc(struct static_key *key) +static void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; - cpus_read_lock(); STATIC_KEY_CHECK_USE(); /* @@ -100,10 +99,8 @@ void static_key_slow_inc(struct static_key *key) */ for (v = atomic_read(&key->enabled); v > 0; v = v1) { v1 = atomic_cmpxchg(&key->enabled, v, v + 1); - if (likely(v1 == v)) { - cpus_read_unlock(); + if (likely(v1 == v)) return; - } } jump_label_lock(); @@ -119,6 +116,12 @@ void static_key_slow_inc(struct static_key *key) atomic_inc(&key->enabled); } jump_label_unlock(); +} + +void static_key_slow_inc(struct static_key *key) +{ + cpus_read_lock(); + static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -163,10 +166,10 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, struct delayed_work *work) +static void static_key_slow_dec_cpuslocked(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) { - cpus_read_lock(); /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -177,7 +180,6 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); - cpus_read_unlock(); return; } @@ -188,6 +190,14 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); +} + +static void __static_key_slow_dec(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ + cpus_read_lock(); + static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } -- cgit v1.2.3 From 5a40527f8f0798553764fc8db4111d7d9c33ea51 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Aug 2017 09:02:56 +0100 Subject: jump_label: Provide hotplug context variants As using the normal static key API under the hotplug lock is pretty much impossible, let's provide a variant of some of them that require the hotplug lock to have already been taken. These function are only meant to be used in CPU hotplug callbacks. 
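As an illustration only (the static key and the hotplug state name are invented), a CPU hotplug callback of the kind these variants target; such callbacks already run with the hotplug lock held, so they must use static_key_enable_cpuslocked() rather than the plain static_key_enable(), which takes cpus_read_lock() itself:

        #include <linux/cpu.h>
        #include <linux/cpuhotplug.h>
        #include <linux/init.h>
        #include <linux/jump_label.h>

        static DEFINE_STATIC_KEY_FALSE(example_cpu_feature);

        static int example_cpu_online(unsigned int cpu)
        {
                /* the hotplug lock is already held here */
                static_key_enable_cpuslocked(&example_cpu_feature.key);
                return 0;
        }

        static int __init example_init(void)
        {
                int ret;

                ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
                                        example_cpu_online, NULL);
                return ret < 0 ? ret : 0;
        }
        late_initcall(example_init);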
Signed-off-by: Marc Zyngier Signed-off-by: Peter Zijlstra (Intel) Cc: Leo Yan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170801080257.5056-4-marc.zyngier@arm.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index cc6d815c75ed..0bf2e8f5244a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -126,15 +126,15 @@ void static_key_slow_inc(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_inc); -void static_key_enable(struct static_key *key) +void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } - cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); @@ -145,23 +145,37 @@ void static_key_enable(struct static_key *key) atomic_set_release(&key->enabled, 1); } jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); + +void static_key_enable(struct static_key *key) +{ + cpus_read_lock(); + static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); -void static_key_disable(struct static_key *key) +void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } - cpus_read_lock(); jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0)) jump_label_update(key); jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); + +void static_key_disable(struct static_key *key) +{ + cpus_read_lock(); + static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); -- cgit v1.2.3 From d89e588ca4081615216cc25f2489b0281ac0bfe9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Sep 2016 11:37:53 +0200 Subject: locking: Introduce smp_mb__after_spinlock() Since its inception, our understanding of ACQUIRE, esp. as applied to spinlocks, has changed somewhat. Also, I wonder if, with a simple change, we cannot make it provide more. The problem with the comment is that the STORE done by spin_lock isn't itself ordered by the ACQUIRE, and therefore a later LOAD can pass over it and cross with any prior STORE, rendering the default WMB insufficient (pointed out by Alan). Now, this is only really a problem on PowerPC and ARM64, both of which already defined smp_mb__before_spinlock() as a smp_mb(). At the same time, we can get a much stronger construct if we place that same barrier _inside_ the spin_lock(). In that case we upgrade the RCpc spinlock to an RCsc. That would make all schedule() calls fully transitive against one another. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Alan Stern Cc: Benjamin Herrenschmidt Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20fba81..9fece583a1f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1967,8 +1967,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with mb() in * set_current_state() the waiting thread does. */ - smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); if (!(p->state & state)) goto out; @@ -3281,8 +3281,8 @@ static void __sched notrace __schedule(bool preempt) * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ - smp_mb__before_spinlock(); rq_lock(rq, &rf); + smp_mb__after_spinlock(); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; -- cgit v1.2.3 From d92a8cfcb37ecd1315269dab741f073b63b3a8b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Mar 2017 10:13:38 +0100 Subject: locking/lockdep: Rework FS_RECLAIM annotation A while ago someone, and I cannot find the email just now, asked if we could not implement the RECLAIM_FS inversion stuff with a 'fake' lock like we use for other things like workqueues etc. I think this should be possible which allows reducing the 'irq' states and will reduce the amount of __bfs() lookups we do. Removing the 1 IRQ state results in 4 less __bfs() walks per dependency, improving lockdep performance. And by moving this annotation out of the lockdep code it becomes easier for the mm people to extend. 
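The mm-side half of the rework is outside this kernel/ diff. As a rough, hypothetical sketch of the 'fake lock' idea the changelog describes (names invented; the real fs_reclaim annotation differs in detail), reclaim is bracketed by acquiring and releasing an ordinary lockdep_map, so the normal dependency machinery replaces the dedicated RECLAIM_FS state:

        #include <linux/gfp.h>
        #include <linux/lockdep.h>

        static struct lockdep_map example_reclaim_map =
                STATIC_LOCKDEP_MAP_INIT("example_fs_reclaim", &example_reclaim_map);

        /* wrapped around __GFP_FS direct reclaim and allocations that may enter it */
        static void example_reclaim_acquire(gfp_t gfp_mask)
        {
                if ((gfp_mask & __GFP_DIRECT_RECLAIM) && (gfp_mask & __GFP_FS))
                        lock_map_acquire(&example_reclaim_map);
        }

        static void example_reclaim_release(gfp_t gfp_mask)
        {
                if ((gfp_mask & __GFP_DIRECT_RECLAIM) && (gfp_mask & __GFP_FS))
                        lock_map_release(&example_reclaim_map);
        }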
Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: iamjoonsoo.kim@lge.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 95 +---------------------------------------- kernel/locking/lockdep_states.h | 1 - 2 files changed, 1 insertion(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7d2499bec5fe..986f2fa79dbb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -344,14 +344,12 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE /* * Quick filtering for interesting events: */ @@ -2567,14 +2565,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - #define STRICT_READ_CHECKS 1 static int (*state_verbose_f[])(struct lock_class *class) = { @@ -2870,57 +2860,6 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - gfp_mask = current_gfp_context(gfp_mask); - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. - */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - /* Disable lockdep if explicitly requested */ - if (gfp_mask & __GFP_NOLOCKDEP) - return; - - mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2966,22 +2905,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). 
- */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } - return 1; } @@ -3040,10 +2963,6 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* @@ -3952,18 +3871,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) } EXPORT_SYMBOL_GPL(lock_unpin_lock); -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); -} -EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} -EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); - #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..35ca09f2ed0b 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/locking/lockdep_states.h @@ -6,4 +6,3 @@ */ LOCKDEP_STATE(HARDIRQ) LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) -- cgit v1.2.3 From ae813308f4630642d2c1c87553929ce95f29f9ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Mar 2017 10:13:38 +0100 Subject: locking/lockdep: Avoid creating redundant links Two boots + a make defconfig, the first didn't have the redundant bit in, the second did: lock-classes: 1168 1169 [max: 8191] direct dependencies: 7688 5812 [max: 32768] indirect dependencies: 25492 25937 all direct dependencies: 220113 217512 dependency chains: 9005 9008 [max: 65536] dependency chain hlocks: 34450 34366 [max: 327680] in-hardirq chains: 55 51 in-softirq chains: 371 378 in-process chains: 8579 8579 stack-trace entries: 108073 88474 [max: 524288] combined max dependencies: 178738560 169094640 max locking depth: 15 15 max bfs queue depth: 320 329 cyclic checks: 9123 9190 redundant checks: 5046 redundant links: 1828 find-mask forwards checks: 2564 2599 find-mask backwards checks: 39521 39789 So it saves nearly 2k links and a fair chunk of stack-trace entries, but as expected, makes no real difference on the indirect dependencies. At the same time, you see the max BFS depth increase, which is also expected, although it could easily be boot variance -- these numbers are not entirely stable between boots. The down side is that the cycles in the graph become larger and thus the reports harder to read. XXX: do we want this as a CONFIG variable, implied by LOCKDEP_SMALL? 
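To make 'redundant' concrete, a hypothetical ordering (locks invented for the example): once lockdep has recorded a -> b and b -> c, the direct a -> c ordering below adds no new information, and with this change the link and its stack trace are no longer stored:

        #include <linux/mutex.h>

        static DEFINE_MUTEX(a);
        static DEFINE_MUTEX(b);
        static DEFINE_MUTEX(c);

        static void order_a_b(void)
        {
                mutex_lock(&a);
                mutex_lock(&b);         /* records a -> b */
                mutex_unlock(&b);
                mutex_unlock(&a);
        }

        static void order_b_c(void)
        {
                mutex_lock(&b);
                mutex_lock(&c);         /* records b -> c */
                mutex_unlock(&c);
                mutex_unlock(&b);
        }

        static void order_a_c(void)
        {
                mutex_lock(&a);
                mutex_lock(&c);         /* implied by a -> b -> c: redundant */
                mutex_unlock(&c);
                mutex_unlock(&a);
        }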
Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: iamjoonsoo.kim@lge.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Link: http://lkml.kernel.org/r/20170303091338.GH6536@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 27 +++++++++++++++++++++++++++ kernel/locking/lockdep_internals.h | 2 ++ kernel/locking/lockdep_proc.c | 4 ++++ 3 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 986f2fa79dbb..b2dd313951ce 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1307,6 +1307,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target, return result; } +static noinline int +check_redundant(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_redundant_checks); + + result = __bfs_forwards(root, target, class_equal, target_entry); + + return result; +} + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of @@ -1872,6 +1885,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } + /* + * Is the -> link redundant? + */ + this.class = hlock_class(prev); + this.parent = NULL; + ret = check_redundant(&this, hlock_class(next), &target_entry); + if (!ret) { + debug_atomic_inc(nr_redundant); + return 2; + } + if (ret < 0) + return print_bfs_bug(ret); + + if (!*stack_saved) { if (!save_trace(&trace)) return 0; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c08fbd2f5ba9..1da4669d57a7 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -143,6 +143,8 @@ struct lockdep_stats { int redundant_softirqs_on; int redundant_softirqs_off; int nr_unused_locks; + int nr_redundant_checks; + int nr_redundant; int nr_cyclic_checks; int nr_cyclic_check_recursions; int nr_find_usage_forwards_checks; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6d1fcc786081..68d9e267ccd4 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m) debug_atomic_read(chain_lookup_hits)); seq_printf(m, " cyclic checks: %11llu\n", debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " redundant checks: %11llu\n", + debug_atomic_read(nr_redundant_checks)); + seq_printf(m, " redundant links: %11llu\n", + debug_atomic_read(nr_redundant)); seq_printf(m, " find-mask forwards checks: %11llu\n", debug_atomic_read(nr_find_usage_forwards_checks)); seq_printf(m, " find-mask backwards checks: %11llu\n", -- cgit v1.2.3 From 545c23f2e954eb3365629b20ceeef4eadb1ff97f Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:48 +0900 Subject: locking/lockdep: Refactor lookup_chain_cache() Currently, lookup_chain_cache() provides both 'lookup' and 'add' functionalities in a function. However, each is useful. So this patch makes lookup_chain_cache() only do 'lookup' functionality and makes add_chain_cahce() only do 'add' functionality. And it's more readable than before. 
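The resulting split follows a familiar lookup-then-add-under-lock shape. Sketched below as a generic, hypothetical cache (lockdep itself allocates from its static lock_chains array under graph_lock rather than using kzalloc() and a spinlock):

        #include <linux/hashtable.h>
        #include <linux/slab.h>
        #include <linux/spinlock.h>

        struct example_chain {
                u64 key;
                struct hlist_node node;
        };

        static DEFINE_HASHTABLE(example_chains, 4);
        static DEFINE_SPINLOCK(example_chains_lock);

        /* lock-free lookup: entries are only ever added, never removed */
        static struct example_chain *example_lookup(u64 key)
        {
                struct example_chain *c;

                hash_for_each_possible_rcu(example_chains, c, node, key)
                        if (c->key == key)
                                return c;
                return NULL;
        }

        /* add path: re-check under the lock to avoid inserting duplicates */
        static struct example_chain *example_lookup_add(u64 key)
        {
                struct example_chain *c = example_lookup(key);

                if (c)
                        return c;

                spin_lock(&example_chains_lock);
                c = example_lookup(key);
                if (!c) {
                        c = kzalloc(sizeof(*c), GFP_ATOMIC);
                        if (c) {
                                c->key = key;
                                hash_add_rcu(example_chains, &c->node, key);
                        }
                }
                spin_unlock(&example_chains_lock);

                return c;
        }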
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 141 +++++++++++++++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b2dd313951ce..e029f2f3b8dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2151,20 +2151,26 @@ static int check_no_collision(struct task_struct *curr, } /* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) + * Adds a dependency chain into chain hashtable. And must be called with + * graph_lock held. + * + * Return 0 if fail, and graph_lock is released. + * Return 1 if succeed, with graph_lock held. */ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) +static inline int add_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) { struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; int i, j; + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ + /* * We might need to take the graph lock, ensure we've got IRQs * disabled to make this an IRQ-safe lock.. for recursion reasons @@ -2172,43 +2178,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ - hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(chain_lookup_hits); - if (!check_no_collision(curr, hlock, chain)) - return 0; - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); - return 0; - } - } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); - /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - if (!graph_lock()) - return 0; - /* - * We have to walk the chain again locked - to avoid duplicates: - */ - hlist_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2260,6 +2230,78 @@ cache_hit: return 1; } +/* + * Look up a dependency chain. 
+ */ +static inline struct lock_chain *lookup_chain_cache(u64 chain_key) +{ + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + /* + * We can walk it lock-free, because entries only get added + * to the hash: + */ + hlist_for_each_entry_rcu(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { + debug_atomic_inc(chain_lookup_hits); + return chain; + } + } + return NULL; +} + +/* + * If the key is not present yet in dependency chain cache then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) + */ +static inline int lookup_chain_cache_add(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct lock_chain *chain = lookup_chain_cache(chain_key); + + if (chain) { +cache_hit: + if (!check_no_collision(curr, hlock, chain)) + return 0; + + if (very_verbose(class)) { + printk("\nhash chain already cached, key: " + "%016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, + class->key, class->name); + } + + return 0; + } + + if (very_verbose(class)) { + printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, class->key, class->name); + } + + if (!graph_lock()) + return 0; + + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + chain = lookup_chain_cache(chain_key); + if (chain) { + graph_unlock(); + goto cache_hit; + } + + if (!add_chain_cache(curr, hlock, chain_key)) + return 0; + + return 1; +} + static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, struct held_lock *hlock, int chain_head, u64 chain_key) { @@ -2270,11 +2312,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * * We look up the chain_key and do the O(N^2) check and update of * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires + * (If lookup_chain_cache_add() return with 1 it acquires * graph_lock for us) */ if (!hlock->trylock && hlock->check && - lookup_chain_cache(curr, hlock, chain_key)) { + lookup_chain_cache_add(curr, hlock, chain_key)) { /* * Check whether last held lock: * @@ -2302,14 +2344,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * Add dependency only if this lock is not the head * of the chain, and if it's not a secondary read-lock: */ - if (!chain_head && ret != 2) + if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) return 0; + } + graph_unlock(); - } else - /* after lookup_chain_cache(): */ + } else { + /* after lookup_chain_cache_add(): */ if (unlikely(!debug_locks)) return 0; + } return 1; } -- cgit v1.2.3 From 49347a986ab45eb1dafbf25170647c890f8ff192 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:49 +0900 Subject: locking/lockdep: Add a function building a chain between two classes Crossrelease needs to build a chain between two classes regardless of their contexts. However, add_chain_cache() cannot be used for that purpose since it assumes that it's called in the acquisition context of the hlock. So this patch introduces a new function doing it. 
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e029f2f3b8dc..bdf6b31f702b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2150,6 +2150,76 @@ static int check_no_collision(struct task_struct *curr, return 1; } +/* + * This is for building a chain between just two different classes, + * instead of adding a new hlock upon current, which is done by + * add_chain_cache(). + * + * This can be called in any context with two classes, while + * add_chain_cache() must be done within the lock owener's context + * since it uses hlock which might be racy in another context. + */ +static inline int add_chain_cache_classes(unsigned int prev, + unsigned int next, + unsigned int irq_context, + u64 chain_key) +{ + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ + + /* + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); + dump_stack(); + return 0; + } + + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; + chain->irq_context = irq_context; + chain->depth = 2; + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = nr_chain_hlocks; + nr_chain_hlocks += chain->depth; + chain_hlocks[chain->base] = prev - 1; + chain_hlocks[chain->base + 1] = next -1; + } +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * Important for check_no_collision(). + */ + else { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; + } +#endif + + hlist_add_head_rcu(&chain->entry, hash_head); + debug_atomic_inc(chain_lookup_misses); + inc_chains(); + + return 1; +} + /* * Adds a dependency chain into chain hashtable. And must be called with * graph_lock held. -- cgit v1.2.3 From 70911fdc9576f4eeb3986689a1c9a778a4a4aacb Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:50 +0900 Subject: locking/lockdep: Change the meaning of check_prev_add()'s return value Firstly, return 1 instead of 2 when 'prev -> next' dependency already exists. Since the value 2 is not referenced anywhere, just return 1 indicating success in this case. Secondly, return 2 instead of 1 when successfully added a lock_list entry with saving stack_trace. With that, a caller can decide whether to avoid redundant save_trace() on the caller site. 
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-4-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bdf6b31f702b..7cf02fab1725 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1881,7 +1881,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 2; + return 1; } } @@ -1935,9 +1935,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, print_lock_name(hlock_class(next)); printk(KERN_CONT "\n"); dump_stack(); - return graph_lock(); + if (!graph_lock()) + return 0; } - return 1; + return 2; } /* -- cgit v1.2.3 From ce07a9415f266e181a0a33033a5f7138760240a4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:51 +0900 Subject: locking/lockdep: Make check_prev_add() able to handle external stack_trace Currently, a space for stack_trace is pinned in check_prev_add(), that makes us not able to use external stack_trace. The simplest way to achieve it is to pass an external stack_trace as an argument. A more suitable solution is to pass a callback additionally along with a stack_trace so that callers can decide the way to save or whether to save. Actually crossrelease needs to do other than saving a stack_trace. So pass a stack_trace and callback to handle it, to check_prev_add(). Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-5-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7cf02fab1725..841828ba35b9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1824,20 +1824,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int *stack_saved) + struct held_lock *next, int distance, struct stack_trace *trace, + int (*save)(struct stack_trace *trace)) { struct lock_list *entry; int ret; struct lock_list this; struct lock_list *uninitialized_var(target_entry); - /* - * Static variable, serialized by the graph_lock(). - * - * We use this static variable to save the stack trace in case - * we call into this function multiple times due to encountering - * trylocks in the held lock stack. 
- */ - static struct stack_trace trace; /* * Prove that the new -> dependency would not @@ -1899,11 +1892,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (!*stack_saved) { - if (!save_trace(&trace)) - return 0; - *stack_saved = 1; - } + if (save && !save(trace)) + return 0; /* * Ok, all validations passed, add the new lock @@ -1911,14 +1901,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; @@ -1926,8 +1916,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - /* We drop graph lock, so another thread can overwrite trace. */ - *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1951,8 +1939,9 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int stack_saved = 0; struct held_lock *hlock; + struct stack_trace trace; + int (*save)(struct stack_trace *trace) = save_trace; /* * Debugging checks. @@ -1977,9 +1966,18 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2 && hlock->check) { - if (!check_prev_add(curr, hlock, next, - distance, &stack_saved)) + int ret = check_prev_add(curr, hlock, next, + distance, &trace, save); + if (!ret) return 0; + + /* + * Stop saving stack_trace if save_trace() was + * called at least once: + */ + if (save && ret == 2) + save = NULL; + /* * Stop after the first non-trylock entry, * as non-trylock entries have added their -- cgit v1.2.3 From b09be676e0ff25bd6d2e7637e26d349f9109ad75 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:52 +0900 Subject: locking/lockdep: Implement the 'crossrelease' feature Lockdep is a runtime locking correctness validator that detects and reports a deadlock or its possibility by checking dependencies between locks. It's useful since it does not report just an actual deadlock but also the possibility of a deadlock that has not actually happened yet. That enables problems to be fixed before they affect real systems. However, this facility is only applicable to typical locks, such as spinlocks and mutexes, which are normally released within the context in which they were acquired. However, synchronization primitives like page locks or completions, which are allowed to be released in any context, also create dependencies and can cause a deadlock. So lockdep should track these locks to do a better job. The 'crossrelease' implementation makes these primitives also be tracked. 
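For illustration, the kind of deadlock possibility this is aimed at, written out as code (the mutex and completion are invented; completions themselves are hooked up to crossrelease in a follow-up patch). Classic lockdep only sees that m is held across the wait in the first context; crossrelease additionally commits the completion -> m dependency recorded in the second context when complete() runs, closing the cycle m -> completion -> m:

        #include <linux/completion.h>
        #include <linux/mutex.h>

        static DEFINE_MUTEX(example_m);
        static DECLARE_COMPLETION(example_done);

        /* context A: holds the mutex across the wait */
        static void example_waiter(void)
        {
                mutex_lock(&example_m);
                wait_for_completion(&example_done);     /* crosslock "acquire" */
                mutex_unlock(&example_m);
        }

        /* context B: needs the same mutex before it can complete() */
        static void example_completer(void)
        {
                mutex_lock(&example_m);
                complete(&example_done);                /* crosslock "commit" */
                mutex_unlock(&example_m);
        }

If A wins the race for example_m, B can never reach complete() and both contexts hang; crossrelease lets lockdep report the possibility even when the hang never actually happens.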
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 1 + kernel/fork.c | 4 + kernel/locking/lockdep.c | 508 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/workqueue.c | 2 + 4 files changed, 492 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..fa72d57db747 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -920,6 +920,7 @@ void __noreturn do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..cbf2221ee81a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,6 +484,8 @@ void __init fork_init(void) cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif + + lockdep_init_task(&init_task); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -1691,6 +1693,7 @@ static __latent_entropy struct task_struct *copy_process( p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; + lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1949,6 +1952,7 @@ bad_fork_cleanup_audit: bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: + lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 841828ba35b9..56f69cc53ddc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -58,6 +58,10 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#include +#endif + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -724,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +static void cross_init(struct lockdep_map *lock, int cross); +static int cross_lock(struct lockdep_map *lock); +static int lock_acquire_crosslock(struct held_lock *hlock); +static int lock_release_crosslock(struct lockdep_map *lock); +#else +static inline void cross_init(struct lockdep_map *lock, int cross) {} +static inline int cross_lock(struct lockdep_map *lock) { return 0; } +static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } +static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } +#endif + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. 
We cache the result in the lock object @@ -1795,6 +1811,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; + if (cross_lock(prev->instance)) + continue; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -1962,30 +1981,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; /* - * Only non-recursive-read entries get new dependencies - * added: + * Only non-crosslock entries get new dependencies added. + * Crosslock entries will be added by commit later: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save); - if (!ret) - return 0; - + if (!cross_lock(hlock->instance)) { /* - * Stop saving stack_trace if save_trace() was - * called at least once: + * Only non-recursive-read entries get new dependencies + * added: */ - if (save && ret == 2) - save = NULL; + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, + distance, &trace, save); + if (!ret) + return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; + /* + * Stop saving stack_trace if save_trace() was + * called at least once: + */ + if (save && ret == 2) + save = NULL; + + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } } depth--; /* @@ -3176,7 +3201,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, +static void __lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { int i; @@ -3234,8 +3259,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } + +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 0); + __lockdep_init_map(lock, name, key, subclass); +} EXPORT_SYMBOL_GPL(lockdep_init_map); +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 1); + __lockdep_init_map(lock, name, key, subclass); +} +EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); +#endif + struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3291,6 +3333,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; + int ret; if (unlikely(!debug_locks)) return 0; @@ -3339,7 +3382,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - if (depth) { + /* TODO: nest_lock is not implemented for crosslock yet. 
*/ + if (depth && !cross_lock(lock)) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3427,6 +3471,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; + ret = lock_acquire_crosslock(hlock); + /* + * 2 means normal acquire operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3664,11 +3716,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int i; + int ret, i; if (unlikely(!debug_locks)) return 0; + ret = lock_release_crosslock(lock); + /* + * 2 means normal release operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -4532,6 +4592,13 @@ asmlinkage __visible void lockdep_sys_exit(void) curr->comm, curr->pid); lockdep_print_held_locks(curr); } + + /* + * The lock history for each syscall should be independent. So wipe the + * slate clean on return to userspace. + */ + crossrelease_hist_end(XHLOCK_PROC); + crossrelease_hist_start(XHLOCK_PROC); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4580,3 +4647,398 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + +/* + * Crossrelease works by recording a lock history for each thread and + * connecting those historic locks that were taken after the + * wait_for_completion() in the complete() context. + * + * Task-A Task-B + * + * mutex_lock(&A); + * mutex_unlock(&A); + * + * wait_for_completion(&C); + * lock_acquire_crosslock(); + * atomic_inc_return(&cross_gen_id); + * | + * | mutex_lock(&B); + * | mutex_unlock(&B); + * | + * | complete(&C); + * `-- lock_commit_crosslock(); + * + * Which will then add a dependency between B and C. + */ + +#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) + +/* + * Whenever a crosslock is held, cross_gen_id will be increased. + */ +static atomic_t cross_gen_id; /* Can be wrapped */ + +/* + * Lock history stacks; we have 3 nested lock history stacks: + * + * Hard IRQ + * Soft IRQ + * History / Task + * + * The thing is that once we complete a (Hard/Soft) IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ. + * + * So what we do is rewind the history buffer and erase all our knowledge of + * that temporal event. + */ + +/* + * We need this to annotate lock history boundaries. Take for instance + * workqueues; each work is independent of the last. The completion of a future + * work does not depend on the completion of a past work (in general). + * Therefore we must not carry that (lock) dependency across works. + * + * This is true for many things; pretty much all kthreads fall into this + * pattern, where they have an 'idle' state and future completions do not + * depend on past completions. Its just that since they all have the 'same' + * form -- the kthread does the same over and over -- it doesn't typically + * matter. 
+ * + * The same is true for system-calls, once a system call is completed (we've + * returned to userspace) the next system call does not depend on the lock + * history of the previous system call. + */ +void crossrelease_hist_start(enum xhlock_context_t c) +{ + if (current->xhlocks) + current->xhlock_idx_hist[c] = current->xhlock_idx; +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ + if (current->xhlocks) + current->xhlock_idx = current->xhlock_idx_hist[c]; +} + +static int cross_lock(struct lockdep_map *lock) +{ + return lock ? lock->cross : 0; +} + +/* + * This is needed to decide the relationship between wrapable variables. + */ +static inline int before(unsigned int a, unsigned int b) +{ + return (int)(a - b) < 0; +} + +static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) +{ + return hlock_class(&xhlock->hlock); +} + +static inline struct lock_class *xlock_class(struct cross_lock *xlock) +{ + return hlock_class(&xlock->hlock); +} + +/* + * Should we check a dependency with previous one? + */ +static inline int depend_before(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check && !hlock->trylock; +} + +/* + * Should we check a dependency with next one? + */ +static inline int depend_after(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check; +} + +/* + * Check if the xhlock is valid, which would be false if, + * + * 1. Has not used after initializaion yet. + * + * Remind hist_lock is implemented as a ring buffer. + */ +static inline int xhlock_valid(struct hist_lock *xhlock) +{ + /* + * xhlock->hlock.instance must be !NULL. + */ + return !!xhlock->hlock.instance; +} + +/* + * Record a hist_lock entry. + * + * Irq disable is only required. + */ +static void add_xhlock(struct held_lock *hlock) +{ + unsigned int idx = ++current->xhlock_idx; + struct hist_lock *xhlock = &xhlock(idx); + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * This can be done locklessly because they are all task-local + * state, we must however ensure IRQs are disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); +#endif + + /* Initialize hist_lock's members */ + xhlock->hlock = *hlock; + + xhlock->trace.nr_entries = 0; + xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; + xhlock->trace.entries = xhlock->trace_entries; + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); +} + +static inline int same_context_xhlock(struct hist_lock *xhlock) +{ + return xhlock->hlock.irq_context == task_irq_context(current); +} + +/* + * This should be lockless as far as possible because this would be + * called very frequently. + */ +static void check_add_xhlock(struct held_lock *hlock) +{ + /* + * Record a hist_lock, only in case that acquisitions ahead + * could depend on the held_lock. For example, if the held_lock + * is trylock then acquisitions ahead never depends on that. + * In that case, we don't need to record it. Just return. + */ + if (!current->xhlocks || !depend_before(hlock)) + return; + + add_xhlock(hlock); +} + +/* + * For crosslock. + */ +static int add_xlock(struct held_lock *hlock) +{ + struct cross_lock *xlock; + unsigned int gen_id; + + if (!graph_lock()) + return 0; + + xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + + gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); + xlock->hlock = *hlock; + xlock->hlock.gen_id = gen_id; + graph_unlock(); + + return 1; +} + +/* + * Called for both normal and crosslock acquires. Normal locks will be + * pushed on the hist_lock queue. 
Cross locks will record state and + * stop regular lock_acquire() to avoid being placed on the held_lock + * stack. + * + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_acquire_crosslock(struct held_lock *hlock) +{ + /* + * CONTEXT 1 CONTEXT 2 + * --------- --------- + * lock A (cross) + * X = atomic_inc_return(&cross_gen_id) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Y = atomic_read_acquire(&cross_gen_id) + * lock B + * + * atomic_read_acquire() is for ordering between A and B, + * IOW, A happens before B, when CONTEXT 2 see Y >= X. + * + * Pairs with atomic_inc_return() in add_xlock(). + */ + hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); + + if (cross_lock(hlock->instance)) + return add_xlock(hlock); + + check_add_xhlock(hlock); + return 2; +} + +static int copy_trace(struct stack_trace *trace) +{ + unsigned long *buf = stack_trace + nr_stack_trace_entries; + unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + unsigned int nr = min(max_nr, trace->nr_entries); + + trace->nr_entries = nr; + memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); + trace->entries = buf; + nr_stack_trace_entries += nr; + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); + dump_stack(); + + return 0; + } + + return 1; +} + +static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) +{ + unsigned int xid, pid; + u64 chain_key; + + xid = xlock_class(xlock) - lock_classes; + chain_key = iterate_chain_key((u64)0, xid); + pid = xhlock_class(xhlock) - lock_classes; + chain_key = iterate_chain_key(chain_key, pid); + + if (lookup_chain_cache(chain_key)) + return 1; + + if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, + chain_key)) + return 0; + + if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, + &xhlock->trace, copy_trace)) + return 0; + + return 1; +} + +static void commit_xhlocks(struct cross_lock *xlock) +{ + unsigned int cur = current->xhlock_idx; + unsigned int i; + + if (!graph_lock()) + return; + + for (i = 0; i < MAX_XHLOCKS_NR; i++) { + struct hist_lock *xhlock = &xhlock(cur - i); + + if (!xhlock_valid(xhlock)) + break; + + if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) + break; + + if (!same_context_xhlock(xhlock)) + break; + + /* + * commit_xhlock() returns 0 with graph_lock already + * released if fail. + */ + if (!commit_xhlock(xlock, xhlock)) + return; + } + + graph_unlock(); +} + +void lock_commit_crosslock(struct lockdep_map *lock) +{ + struct cross_lock *xlock; + unsigned long flags; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (!current->xhlocks) + return; + + /* + * Do commit hist_locks with the cross_lock, only in case that + * the cross_lock could depend on acquisitions after that. + * + * For example, if the cross_lock does not have the 'check' flag + * then we don't need to check dependencies and commit for that. + * Just skip it. In that case, of course, the cross_lock does + * not depend on acquisitions ahead, either. + * + * WARNING: Don't do that in add_xlock() in advance. When an + * acquisition context is different from the commit context, + * invalid(skipped) cross_lock might be accessed. 
+ */ + if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + xlock = &((struct lockdep_map_cross *)lock)->xlock; + commit_xhlocks(xlock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_commit_crosslock); + +/* + * Return: 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_release_crosslock(struct lockdep_map *lock) +{ + return cross_lock(lock) ? 1 : 2; +} + +static void cross_init(struct lockdep_map *lock, int cross) +{ + lock->cross = cross; + + /* + * Crossrelease assumes that the ring buffer size of xhlocks + * is aligned with power of 2. So force it on build. + */ + BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); +} + +void lockdep_init_task(struct task_struct *task) +{ + int i; + + task->xhlock_idx = UINT_MAX; + + for (i = 0; i < XHLOCK_CTX_NR; i++) + task->xhlock_idx_hist[i] = UINT_MAX; + + task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, + GFP_KERNEL); +} + +void lockdep_free_task(struct task_struct *task) +{ + if (task->xhlocks) { + void *tmp = task->xhlocks; + /* Diable crossrelease for current */ + task->xhlocks = NULL; + kfree(tmp); + } +} +#endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca937b0c3a96..e86733a8b344 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2093,6 +2093,7 @@ __acquires(&pool->lock) lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); + crossrelease_hist_start(XHLOCK_PROC); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2100,6 +2101,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work); + crossrelease_hist_end(XHLOCK_PROC); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); -- cgit v1.2.3 From 23f873d8f9526ed7e49a1a02a45f8afb9ae5fb84 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:53 +0900 Subject: locking/lockdep: Detect and handle hist_lock ring buffer overwrite The ring buffer can be overwritten by hardirq/softirq/work contexts. That cases must be considered on rollback or commit. For example, |<------ hist_lock ring buffer size ----->| ppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii wrapped > iiiiiiiiiiiiiiiiiiiiiii.................... where 'p' represents an acquisition in process context, 'i' represents an acquisition in irq context. On irq exit, crossrelease tries to rollback idx to original position, but it should not because the entry already has been invalid by overwriting 'i'. Avoid rollback or commit for entries overwritten. 
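A generic sketch of the check this adds (types and names are invented; the real logic lives in crossrelease_hist_end() below): next to the saved rollback index, remember a free-running per-entry id; if the id stored in that slot has changed by rollback time, the ring wrapped and the entry is invalidated instead of being reused:

        #include <linux/types.h>

        #define EXAMPLE_RING_SIZE 64                      /* power of two, like MAX_XHLOCKS_NR */

        struct example_ring {
                unsigned int slot_id[EXAMPLE_RING_SIZE];  /* id of the entry in each slot */
                unsigned int idx;                         /* free-running write index     */
                unsigned int next_id;                     /* free-running entry id        */
        };

        static void example_push(struct example_ring *r)
        {
                r->slot_id[++r->idx % EXAMPLE_RING_SIZE] = ++r->next_id;
        }

        /* roll back to a saved position; report whether that entry survived */
        static bool example_rollback(struct example_ring *r,
                                     unsigned int saved_idx, unsigned int saved_id)
        {
                r->idx = saved_idx;
                return r->slot_id[saved_idx % EXAMPLE_RING_SIZE] == saved_id;
        }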
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-7-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 52 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 56f69cc53ddc..eda8114ef793 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4680,6 +4680,17 @@ EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); */ static atomic_t cross_gen_id; /* Can be wrapped */ +/* + * Make an entry of the ring buffer invalid. + */ +static inline void invalidate_xhlock(struct hist_lock *xhlock) +{ + /* + * Normally, xhlock->hlock.instance must be !NULL. + */ + xhlock->hlock.instance = NULL; +} + /* * Lock history stacks; we have 3 nested lock history stacks: * @@ -4712,14 +4723,28 @@ static atomic_t cross_gen_id; /* Can be wrapped */ */ void crossrelease_hist_start(enum xhlock_context_t c) { - if (current->xhlocks) - current->xhlock_idx_hist[c] = current->xhlock_idx; + struct task_struct *cur = current; + + if (cur->xhlocks) { + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; + } } void crossrelease_hist_end(enum xhlock_context_t c) { - if (current->xhlocks) - current->xhlock_idx = current->xhlock_idx_hist[c]; + struct task_struct *cur = current; + + if (cur->xhlocks) { + unsigned int idx = cur->xhlock_idx_hist[c]; + struct hist_lock *h = &xhlock(idx); + + cur->xhlock_idx = idx; + + /* Check if the ring was overwritten. */ + if (h->hist_id != cur->hist_id_save[c]) + invalidate_xhlock(h); + } } static int cross_lock(struct lockdep_map *lock) @@ -4765,6 +4790,7 @@ static inline int depend_after(struct held_lock *hlock) * Check if the xhlock is valid, which would be false if, * * 1. Has not used after initializaion yet. + * 2. Got invalidated. * * Remind hist_lock is implemented as a ring buffer. */ @@ -4796,6 +4822,7 @@ static void add_xhlock(struct held_lock *hlock) /* Initialize hist_lock's members */ xhlock->hlock = *hlock; + xhlock->hist_id = current->hist_id++; xhlock->trace.nr_entries = 0; xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; @@ -4934,6 +4961,7 @@ static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) static void commit_xhlocks(struct cross_lock *xlock) { unsigned int cur = current->xhlock_idx; + unsigned int prev_hist_id = xhlock(cur).hist_id; unsigned int i; if (!graph_lock()) @@ -4951,6 +4979,17 @@ static void commit_xhlocks(struct cross_lock *xlock) if (!same_context_xhlock(xhlock)) break; + /* + * Filter out the cases that the ring buffer was + * overwritten and the previous entry has a bigger + * hist_id than the following one, which is impossible + * otherwise. + */ + if (unlikely(before(xhlock->hist_id, prev_hist_id))) + break; + + prev_hist_id = xhlock->hist_id; + /* * commit_xhlock() returns 0 with graph_lock already * released if fail. 
@@ -5024,9 +5063,12 @@ void lockdep_init_task(struct task_struct *task) int i; task->xhlock_idx = UINT_MAX; + task->hist_id = 0; - for (i = 0; i < XHLOCK_CTX_NR; i++) + for (i = 0; i < XHLOCK_CTX_NR; i++) { task->xhlock_idx_hist[i] = UINT_MAX; + task->hist_id_save[i] = 0; + } task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, GFP_KERNEL); -- cgit v1.2.3 From 28a903f63ec0811ead70ad0f8665e838d207a25e Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:54 +0900 Subject: locking/lockdep: Handle non(or multi)-acquisition of a crosslock No acquisition might be in progress on commit of a crosslock. Completion operations enabling crossrelease are the case like: CONTEXT X CONTEXT Y --------- --------- trigger completion context complete AX commit AX wait_for_complete AX acquire AX wait where AX is a crosslock. When no acquisition is in progress, we should not perform commit because the lock does not exist, which might cause incorrect memory access. So we have to track the number of acquisitions of a crosslock to handle it. Moreover, in case that more than one acquisition of a crosslock are overlapped like: CONTEXT W CONTEXT X CONTEXT Y CONTEXT Z --------- --------- --------- --------- acquire AX (gen_id: 1) acquire A acquire AX (gen_id: 10) acquire B commit AX acquire C commit AX where A, B and C are typical locks and AX is a crosslock. Current crossrelease code performs commits in Y and Z with gen_id = 10. However, we can use gen_id = 1 to do it, since not only 'acquire AX in X' but 'acquire AX in W' also depends on each acquisition in Y and Z until their commits. So make it use gen_id = 1 instead of 10 on their commits, which adds an additional dependency 'AX -> A' in the example above. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-8-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 82 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eda8114ef793..7f97871d48d5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4867,11 +4867,28 @@ static int add_xlock(struct held_lock *hlock) xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + /* + * When acquisitions for a crosslock are overlapped, we use + * nr_acquire to perform commit for them, based on cross_gen_id + * of the first acquisition, which allows to add additional + * dependencies. + * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. + * + * depend_after() is necessary to initialize only the first + * valid xlock so that the xlock can be used on its commit. 
+ */ + if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) + goto unlock; + gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); xlock->hlock = *hlock; xlock->hlock.gen_id = gen_id; +unlock: graph_unlock(); - return 1; } @@ -4967,35 +4984,37 @@ static void commit_xhlocks(struct cross_lock *xlock) if (!graph_lock()) return; - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); + if (xlock->nr_acquire) { + for (i = 0; i < MAX_XHLOCKS_NR; i++) { + struct hist_lock *xhlock = &xhlock(cur - i); - if (!xhlock_valid(xhlock)) - break; + if (!xhlock_valid(xhlock)) + break; - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; + if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) + break; - if (!same_context_xhlock(xhlock)) - break; + if (!same_context_xhlock(xhlock)) + break; - /* - * Filter out the cases that the ring buffer was - * overwritten and the previous entry has a bigger - * hist_id than the following one, which is impossible - * otherwise. - */ - if (unlikely(before(xhlock->hist_id, prev_hist_id))) - break; + /* + * Filter out the cases that the ring buffer was + * overwritten and the previous entry has a bigger + * hist_id than the following one, which is impossible + * otherwise. + */ + if (unlikely(before(xhlock->hist_id, prev_hist_id))) + break; - prev_hist_id = xhlock->hist_id; + prev_hist_id = xhlock->hist_id; - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; + /* + * commit_xhlock() returns 0 with graph_lock already + * released if fail. + */ + if (!commit_xhlock(xlock, xhlock)) + return; + } } graph_unlock(); @@ -5039,16 +5058,27 @@ void lock_commit_crosslock(struct lockdep_map *lock) EXPORT_SYMBOL_GPL(lock_commit_crosslock); /* - * Return: 1 - crosslock, done; + * Return: 0 - failure; + * 1 - crosslock, done; * 2 - normal lock, continue to held_lock[] ops. */ static int lock_release_crosslock(struct lockdep_map *lock) { - return cross_lock(lock) ? 1 : 2; + if (cross_lock(lock)) { + if (!graph_lock()) + return 0; + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; + graph_unlock(); + return 1; + } + return 2; } static void cross_init(struct lockdep_map *lock, int cross) { + if (cross) + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; + lock->cross = cross; /* -- cgit v1.2.3 From 383a4bc88849b804385162e81bf704f8f9690a87 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:55 +0900 Subject: locking/lockdep: Make print_circular_bug() aware of crossrelease print_circular_bug() reporting circular bug assumes that target hlock is owned by the current. However, in crossrelease, target hlock can be owned by other than the current. So the report format needs to be changed to reflect the change. 
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-9-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 67 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7f97871d48d5..1114dc42c27f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1139,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); + if (cross_lock(tgt->instance)) { + printk(" Possible unsafe locking scenario by crosslock:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk(" unlock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } else { + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } } /* @@ -1179,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - pr_warn("\nbut task is already holding lock:\n"); + + if (cross_lock(check_tgt->instance)) + pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); + else + pr_warn("\nbut task is already holding lock:\n"); + print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); @@ -1197,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data) static noinline int print_circular_bug(struct lock_list *this, struct lock_list *target, struct held_lock *check_src, - struct held_lock *check_tgt) + struct held_lock *check_tgt, + struct stack_trace *trace) { struct task_struct *curr = current; struct lock_list *parent; @@ -1207,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (!save_trace(&this->trace)) + if (cross_lock(check_tgt->instance)) + this->trace = *trace; + else if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1864,7 
+1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) - return print_circular_bug(&this, target_entry, next, prev); + return print_circular_bug(&this, target_entry, next, prev, trace); else if (unlikely(ret < 0)) return print_bfs_bug(ret); -- cgit v1.2.3 From cd8084f91c02c1afd256a39aa833bff737631304 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:56 +0900 Subject: locking/lockdep: Apply crossrelease to completions Although wait_for_completion() and its family can cause deadlock, the lock correctness validator could not be applied to them until now, because things like complete() are usually called in a different context from the waiting context, which violates lockdep's assumption. Thanks to CONFIG_LOCKDEP_CROSSRELEASE, we can now apply the lockdep detector to those completion operations. Applied it. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-10-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..566b6ec7b6fe 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -32,6 +32,12 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); + + /* + * Perform commit of crossrelease here. + */ + complete_release_commit(x); + if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); @@ -92,9 +98,14 @@ __wait_for_common(struct completion *x, { might_sleep(); + complete_acquire(x); + spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); + + complete_release(x); + return timeout; } -- cgit v1.2.3 From 90001d67be2fa2acbe3510d1f64fa6533efa30ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Jul 2017 17:50:05 +0200 Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING In commit: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Rik changed wake_affine to consider NUMA information when balancing between LLC domains. There are a number of problems here which this patch tries to address: - LLC < NODE; in this case we'd use the wrong information to balance - !NUMA_BALANCING: in this case, the new code doesn't do any balancing at all - re-computes the NUMA data for every wakeup, this can mean iterating up to 64 CPUs for every wakeup. - default affine wakeups inside a cache We address these by saving the load/capacity values for each sched_domain during regular load-balance and using these values in wake_affine_llc(). The obvious down-side to using cached values is that they can be too old and poorly reflect reality. But this way we can use LLC wide information and thus not rely on assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do we have to aggegate two nodes (or even cache domains) worth of CPUs for each wakeup. 
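Stripped of the has_capacity/nr_running short-cuts, the decision reduces to an effective-load comparison over the cached LLC statistics. The stand-alone sketch below shows only that comparison; the struct and function names are placeholders for the example, while the arithmetic mirrors the wake_affine_llc() hunk further down.

/*
 * Illustrative user-space sketch (not the kernel implementation) of the
 * effective-load test done on the cached per-LLC load/capacity values.
 */
#include <stdbool.h>
#include <stdint.h>

struct llc_stats {
	unsigned long load;		/* cached sds->total_load for the LLC */
	unsigned long capacity;		/* cached sds->total_capacity */
};

static bool wake_affine_ok(const struct llc_stats *prev_llc,
			   const struct llc_stats *this_llc,
			   unsigned long task_load,
			   unsigned int imbalance_pct)
{
	int64_t this_eff_load, prev_eff_load;

	/* cost of moving: the waking LLC's load plus the task, weighed by
	 * the other side's capacity */
	this_eff_load = 100;
	this_eff_load *= prev_llc->capacity;
	this_eff_load *= this_llc->load + task_load;

	/* cost of staying: the previous LLC's load minus the task, weighed
	 * by this side's capacity and padded by half of imbalance_pct */
	prev_eff_load = 100 + (imbalance_pct - 100) / 2;
	prev_eff_load *= this_llc->capacity;
	prev_eff_load *= (int64_t)prev_llc->load - (int64_t)task_load;

	return this_eff_load <= prev_eff_load;	/* true: pull the task over */
}

The halved imbalance_pct padding on the source side tolerates a small residual imbalance before an affine wakeup is rejected.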
Signed-off-by: Peter Zijlstra (Intel) Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") [ Minor readability improvements. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 190 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 122 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7f1c3b797f8..8d5868771cb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2658,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } -/* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? - */ -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - struct numa_stats prev_load, this_load; - s64 this_eff_load, prev_eff_load; - - update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); - update_numa_stats(&this_load, cpu_to_node(this_cpu)); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - if (this_load.load > current_load) - this_load.load -= current_load; - else - this_load.load = 0; - } - - /* - * In low-load situations, where this_cpu's node is idle due to the - * sync cause above having dropped this_load.load to 0, move the task. - * Moving to an idle socket will not create a bad imbalance. - * - * Otherwise check if the nodes are near enough in load to allow this - * task to be woken on this_cpu's node. - */ - if (this_load.load > 0) { - unsigned long task_load = task_h_load(p); - - this_eff_load = 100; - this_eff_load *= prev_load.compute_capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_load.compute_capacity; - - this_eff_load *= this_load.load + task_load; - prev_eff_load *= prev_load.load - task_load; - - return this_eff_load <= prev_eff_load; - } - - return true; -} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2724,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } -#ifdef CONFIG_SMP -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - return true; -} -#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void @@ -5428,20 +5367,115 @@ static int wake_wide(struct task_struct *p) return 1; } +struct llc_stats { + unsigned long nr_running; + unsigned long load; + unsigned long capacity; + int has_capacity; +}; + +static bool get_llc_stats(struct llc_stats *stats, int cpu) +{ + struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + + if (!sds) + return false; + + stats->nr_running = READ_ONCE(sds->nr_running); + stats->load = READ_ONCE(sds->load); + stats->capacity = READ_ONCE(sds->capacity); + stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); + + return true; +} + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? 
+ * + * Since we're running on 'stale' values, we might in fact create an imbalance + * but recomputing these values is expensive, as that'd mean iteration 2 cache + * domains worth of CPUs. + */ +static bool +wake_affine_llc(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) +{ + struct llc_stats prev_stats, this_stats; + s64 this_eff_load, prev_eff_load; + unsigned long task_load; + + if (!get_llc_stats(&prev_stats, prev_cpu) || + !get_llc_stats(&this_stats, this_cpu)) + return false; + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current LLC. + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + /* in this case load hits 0 and this LLC is considered 'idle' */ + if (current_load > this_stats.load) + return true; + + this_stats.load -= current_load; + } + + /* + * The has_capacity stuff is not SMT aware, but by trying to balance + * the nr_running on both ends we try and fill the domain at equal + * rates, thereby first consuming cores before siblings. + */ + + /* if the old cache has capacity, stay there */ + if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) + return false; + + /* if this cache has capacity, come here */ + if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) + return true; + + /* + * Check to see if we can move the load without causing too much + * imbalance. + */ + task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_stats.capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_stats.capacity; + + this_eff_load *= this_stats.load + task_load; + prev_eff_load *= prev_stats.load - task_load; + + return this_eff_load <= prev_eff_load; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine = false; + bool affine; /* - * Common case: CPUs are in the same socket, and select_idle_sibling() - * will do its thing regardless of what we return: + * Default to no affine wakeups; wake_affine() should not effect a task + * placement the load-balancer feels inclined to undo. The conservative + * option is therefore to not move tasks when they wake up. */ - if (cpus_share_cache(prev_cpu, this_cpu)) - affine = true; - else - affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); + affine = false; + + /* + * If the wakeup is across cache domains, try to evaluate if movement + * makes sense, otherwise rely on select_idle_siblings() to do + * placement inside the cache domain. 
+ */ + if (!cpus_share_cache(prev_cpu, this_cpu)) + affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { @@ -7121,6 +7155,7 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ + unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ @@ -7140,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, + .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { @@ -7575,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { + struct sched_domain_shared *shared = env->sd->shared; struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; @@ -7631,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ + sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -7646,6 +7684,21 @@ next_group: env->dst_rq->rd->overload = overload; } + if (!shared) + return; + + /* + * Since these are sums over groups they can contain some CPUs + * multiple times for the NUMA domains. + * + * Currently only wake_affine_llc() and find_busiest_group() + * uses these numbers, only the last is affected by this problem. + * + * XXX fix that. + */ + WRITE_ONCE(shared->nr_running, sds->total_running); + WRITE_ONCE(shared->load, sds->total_load); + WRITE_ONCE(shared->capacity, sds->total_capacity); } /** @@ -7875,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + /* XXX broken for overlapping NUMA groups */ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; -- cgit v1.2.3 From 1e58565e6d147751d85ee9e692d1226059b3318f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 2 Aug 2017 14:13:00 +0530 Subject: sched/autogroup: Fix error reporting printk text in autogroup_create() Its kzalloc() not kmalloc() which has failed earlier. While here remove a redundant empty line. Signed-off-by: Anshuman Khandual Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170802084300.29487-1-khandual@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/autogroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index da39489d2d80..de6d7f4dfcb5 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void) goto out_fail; tg = sched_create_group(&root_task_group); - if (IS_ERR(tg)) goto out_free; @@ -101,7 +100,7 @@ out_free: out_fail: if (printk_ratelimit()) { printk(KERN_WARNING "autogroup_create: %s failure.\n", - ag ? "sched_create_group()" : "kmalloc()"); + ag ? 
"sched_create_group()" : "kzalloc()"); } return autogroup_kref_get(&autogroup_default); -- cgit v1.2.3 From d507e2ebd2c7be9138e5cf5c0cb1931c90c42ab1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Aug 2017 15:23:31 -0700 Subject: mm: fix global NR_SLAB_.*CLAIMABLE counter reads As Tetsuo points out: "Commit 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") broke "Slab:" field of /proc/meminfo . It shows nearly 0kB" In addition to /proc/meminfo, this problem also affects the slab counters OOM/allocation failure info dumps, can cause early -ENOMEM from overcommit protection, and miscalculate image size requirements during suspend-to-disk. This is because the patch in question switched the slab counters from the zone level to the node level, but forgot to update the global accessor functions to read the aggregate node data instead of the aggregate zone data. Use global_node_page_state() to access the global slab counters. Fixes: 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") Link: http://lkml.kernel.org/r/20170801134256.5400-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Josef Bacik Cc: Vladimir Davydov Cc: Stefan Agner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 222317721c5a..0972a8e09d08 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state(NR_SLAB_RECLAIMABLE) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) -- cgit v1.2.3 From 16af97dc5a8975371a83d9e30a64038b48f40a2d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:56 -0700 Subject: mm: migrate: prevent racy access to tlb_flush_pending Patch series "fixes of TLB batching races", v6. It turns out that Linux TLB batching mechanism suffers from various races. Races that are caused due to batching during reclamation were recently handled by Mel and this patch-set deals with others. The more fundamental issue is that concurrent updates of the page-tables allow for TLB flushes to be batched on one core, while another core changes the page-tables. This other core may assume a PTE change does not require a flush based on the updated PTE value, while it is unaware that TLB flushes are still pending. This behavior affects KSM (which may result in memory corruption) and MADV_FREE and MADV_DONTNEED (which may result in incorrect behavior). A proof-of-concept can easily produce the wrong behavior of MADV_DONTNEED. Memory corruption in KSM is harder to produce in practice, but was observed by hacking the kernel and adding a delay before flushing and replacing the KSM page. Finally, there is also one memory barrier missing, which may affect architectures with weak memory model. This patch (of 7): Setting and clearing mm->tlb_flush_pending can be performed by multiple threads, since mmap_sem may only be acquired for read in task_numa_work(). If this happens, tlb_flush_pending might be cleared while one of the threads still changes PTEs and batches TLB flushes. 
This can lead to the same race between migration and change_protection_range() that led to the introduction of tlb_flush_pending. The result of this race was data corruption, which means that this patch also addresses a theoretically possible data corruption. An actual data corruption was not observed, yet the race was was confirmed by adding assertion to check tlb_flush_pending is not set by two threads, adding artificial latency in change_protection_range() and using sysctl to reduce kernel.numa_balancing_scan_delay_ms. Link: http://lkml.kernel.org/r/20170802000818.4760-2-namit@vmware.com Fixes: 20841405940e ("mm: fix TLB flush race between migration, and change_protection_range") Signed-off-by: Nadav Amit Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..e075b7780421 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -807,7 +807,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_aio(mm); mm_init_owner(mm, p); mmu_notifier_mm_init(mm); - clear_tlb_flush_pending(mm); + init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif -- cgit v1.2.3 From 690cbb90a709c1b9389c6cb8e1978e77553ce0fb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Aug 2017 00:13:07 +0200 Subject: PM / s2idle: Rename PM_SUSPEND_FREEZE to PM_SUSPEND_TO_IDLE To make it clear that the symbol in question refers to suspend-to-idle, rename it from PM_SUSPEND_FREEZE to PM_SUSPEND_TO_IDLE. Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 44 ++++++++++++++++++++++---------------------- kernel/power/suspend_test.c | 4 ++-- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0639d3a79852..6333078a438b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -35,19 +35,19 @@ #include "power.h" const char * const pm_labels[] = { - [PM_SUSPEND_FREEZE] = "freeze", + [PM_SUSPEND_TO_IDLE] = "freeze", [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", }; const char *pm_states[PM_SUSPEND_MAX]; static const char * const mem_sleep_labels[] = { - [PM_SUSPEND_FREEZE] = "s2idle", + [PM_SUSPEND_TO_IDLE] = "s2idle", [PM_SUSPEND_STANDBY] = "shallow", [PM_SUSPEND_MEM] = "deep", }; const char *mem_sleep_states[PM_SUSPEND_MAX]; -suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; +suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE; suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; suspend_state_t pm_suspend_target_state; EXPORT_SYMBOL_GPL(pm_suspend_target_state); @@ -76,7 +76,7 @@ static void freeze_begin(void) static void freeze_enter(void) { - trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) @@ -103,7 +103,7 @@ static void freeze_enter(void) suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); - trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); } static void s2idle_loop(void) @@ -175,19 +175,19 @@ void __init pm_states_init(void) { /* "mem" and "freeze" are always present in /sys/power/state. */ pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; - pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE]; + pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE]; /* * Suspend-to-idle should be supported even without any suspend_ops, * initialize mem_sleep_states[] accordingly here. */ - mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE]; + mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE]; } static int __init mem_sleep_default_setup(char *str) { suspend_state_t state; - for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++) + for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++) if (mem_sleep_labels[state] && !strcmp(str, mem_sleep_labels[state])) { mem_sleep_default = state; @@ -239,48 +239,48 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static bool sleep_state_supported(suspend_state_t state) { - return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); + return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter); } static int platform_suspend_prepare(suspend_state_t state) { - return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? + return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ? suspend_ops->prepare() : 0; } static int platform_suspend_prepare_late(suspend_state_t state) { - return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? + return state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->prepare ? freeze_ops->prepare() : 0; } static int platform_suspend_prepare_noirq(suspend_state_t state) { - return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? + return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ? 
suspend_ops->prepare_late() : 0; } static void platform_resume_noirq(suspend_state_t state) { - if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake) suspend_ops->wake(); } static void platform_resume_early(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) + if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->restore) freeze_ops->restore(); } static void platform_resume_finish(suspend_state_t state) { - if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish) suspend_ops->finish(); } static int platform_suspend_begin(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) + if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->begin) return freeze_ops->begin(); else if (suspend_ops && suspend_ops->begin) return suspend_ops->begin(state); @@ -290,7 +290,7 @@ static int platform_suspend_begin(suspend_state_t state) static void platform_resume_end(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->end) freeze_ops->end(); else if (suspend_ops && suspend_ops->end) suspend_ops->end(); @@ -298,13 +298,13 @@ static void platform_resume_end(suspend_state_t state) static void platform_recover(suspend_state_t state) { - if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover) suspend_ops->recover(); } static bool platform_suspend_again(suspend_state_t state) { - return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? + return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ? suspend_ops->suspend_again() : false; } @@ -400,7 +400,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Devices_early_resume; - if (state == PM_SUSPEND_FREEZE && pm_test_level != TEST_PLATFORM) { + if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) { s2idle_loop(); goto Platform_early_resume; } @@ -538,7 +538,7 @@ static int enter_state(suspend_state_t state) int error; trace_suspend_resume(TPS("suspend_enter"), state, true); - if (state == PM_SUSPEND_FREEZE) { + if (state == PM_SUSPEND_TO_IDLE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); @@ -551,7 +551,7 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; - if (state == PM_SUSPEND_FREEZE) + if (state == PM_SUSPEND_TO_IDLE) freeze_begin(); #ifndef CONFIG_SUSPEND_SKIP_SYNC diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 5db217051232..6a897e8b2a88 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -104,9 +104,9 @@ repeat: printk(info_test, pm_states[state]); status = pm_suspend(state); if (status < 0) - state = PM_SUSPEND_FREEZE; + state = PM_SUSPEND_TO_IDLE; } - if (state == PM_SUSPEND_FREEZE) { + if (state == PM_SUSPEND_TO_IDLE) { printk(info_test, pm_states[state]); status = pm_suspend(state); } -- cgit v1.2.3 From f02f4f9d826590f1ef1b087374d3e3cfb7949eac Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 10 Aug 2017 00:13:56 +0200 Subject: PM / s2idle: Rename freeze_state enum and related items Rename the freeze_state enum representing the suspend-to-idle state machine states to s2idle_states and rename the related variables and functions accordingly. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 48 ++++++++++++++++++++++++------------------------ kernel/sched/idle.c | 6 +++--- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6333078a438b..ae9b579c2533 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -57,10 +57,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; -static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); -enum freeze_state __read_mostly suspend_freeze_state; -static DEFINE_SPINLOCK(suspend_freeze_lock); +enum s2idle_states __read_mostly s2idle_state; +static DEFINE_SPINLOCK(s2idle_lock); void freeze_set_ops(const struct platform_freeze_ops *ops) { @@ -69,21 +69,21 @@ void freeze_set_ops(const struct platform_freeze_ops *ops) unlock_system_sleep(); } -static void freeze_begin(void) +static void s2idle_begin(void) { - suspend_freeze_state = FREEZE_STATE_NONE; + s2idle_state = S2IDLE_STATE_NONE; } -static void freeze_enter(void) +static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); - spin_lock_irq(&suspend_freeze_lock); + spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; - suspend_freeze_state = FREEZE_STATE_ENTER; - spin_unlock_irq(&suspend_freeze_lock); + s2idle_state = S2IDLE_STATE_ENTER; + spin_unlock_irq(&s2idle_lock); get_online_cpus(); cpuidle_resume(); @@ -91,17 +91,17 @@ static void freeze_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - wait_event(suspend_freeze_wait_head, - suspend_freeze_state == FREEZE_STATE_WAKE); + wait_event(s2idle_wait_head, + s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); put_online_cpus(); - spin_lock_irq(&suspend_freeze_lock); + spin_lock_irq(&s2idle_lock); out: - suspend_freeze_state = FREEZE_STATE_NONE; - spin_unlock_irq(&suspend_freeze_lock); + s2idle_state = S2IDLE_STATE_NONE; + spin_unlock_irq(&s2idle_lock); trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); } @@ -118,12 +118,12 @@ static void s2idle_loop(void) /* * Suspend-to-idle equals * frozen processes + suspended devices + idle processors. - * Thus freeze_enter() should be called right after + * Thus s2idle_enter() should be called right after * all devices have been suspended. 
*/ error = dpm_noirq_suspend_devices(PMSG_SUSPEND); if (!error) - freeze_enter(); + s2idle_enter(); dpm_noirq_resume_devices(PMSG_RESUME); if (error && (error != -EBUSY || !pm_wakeup_pending())) { @@ -148,18 +148,18 @@ static void s2idle_loop(void) pm_pr_dbg("resume from suspend-to-idle\n"); } -void freeze_wake(void) +void s2idle_wake(void) { unsigned long flags; - spin_lock_irqsave(&suspend_freeze_lock, flags); - if (suspend_freeze_state > FREEZE_STATE_NONE) { - suspend_freeze_state = FREEZE_STATE_WAKE; - wake_up(&suspend_freeze_wait_head); + spin_lock_irqsave(&s2idle_lock, flags); + if (s2idle_state > S2IDLE_STATE_NONE) { + s2idle_state = S2IDLE_STATE_WAKE; + wake_up(&s2idle_wait_head); } - spin_unlock_irqrestore(&suspend_freeze_lock, flags); + spin_unlock_irqrestore(&s2idle_lock, flags); } -EXPORT_SYMBOL_GPL(freeze_wake); +EXPORT_SYMBOL_GPL(s2idle_wake); static bool valid_state(suspend_state_t state) { @@ -552,7 +552,7 @@ static int enter_state(suspend_state_t state) return -EBUSY; if (state == PM_SUSPEND_TO_IDLE) - freeze_begin(); + s2idle_begin(); #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6c23e30c0e5c..2ea4b7b23044 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,7 +158,7 @@ static void cpuidle_idle_call(void) } /* - * Suspend-to-idle ("freeze") is a system state in which all user space + * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only * activity happens here and in iterrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state @@ -167,8 +167,8 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ - if (idle_should_freeze() || dev->use_deepest_state) { - if (idle_should_freeze()) { + if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle()) { entered_state = cpuidle_enter_freeze(drv, dev); if (entered_state > 0) { local_irq_enable(); -- cgit v1.2.3 From 28ba086ed30fb3fb714598aa029b894c3754fa7b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Aug 2017 00:14:45 +0200 Subject: PM / s2idle: Rename ->enter_freeze to ->enter_s2idle Rename the ->enter_freeze cpuidle driver callback to ->enter_s2idle to make it clear that it is used for entering suspend-to-idle and rename the related functions, variables and so on accordingly. Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2ea4b7b23044..257f4f0b4532 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -169,7 +169,7 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle() || dev->use_deepest_state) { if (idle_should_enter_s2idle()) { - entered_state = cpuidle_enter_freeze(drv, dev); + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; -- cgit v1.2.3 From 23d5855f4774f4f7c246a67057ecacc904696d8a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Aug 2017 00:15:30 +0200 Subject: PM / s2idle: Rename platform operations structure Rename struct platform_freeze_ops to platform_s2idle_ops to make it clear that the callbacks in it are used during suspend-to-idle suspend/resume transitions and rename the related functions, variables and so on accordingly. Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ae9b579c2533..3e2b4f519009 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -56,16 +56,16 @@ unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; -static const struct platform_freeze_ops *freeze_ops; +static const struct platform_s2idle_ops *s2idle_ops; static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_SPINLOCK(s2idle_lock); -void freeze_set_ops(const struct platform_freeze_ops *ops) +void s2idle_set_ops(const struct platform_s2idle_ops *ops) { lock_system_sleep(); - freeze_ops = ops; + s2idle_ops = ops; unlock_system_sleep(); } @@ -131,13 +131,13 @@ static void s2idle_loop(void) break; } - if (freeze_ops && freeze_ops->wake) - freeze_ops->wake(); + if (s2idle_ops && s2idle_ops->wake) + s2idle_ops->wake(); dpm_noirq_end(); - if (freeze_ops && freeze_ops->sync) - freeze_ops->sync(); + if (s2idle_ops && s2idle_ops->sync) + s2idle_ops->sync(); if (pm_wakeup_pending()) break; @@ -250,8 +250,8 @@ static int platform_suspend_prepare(suspend_state_t state) static int platform_suspend_prepare_late(suspend_state_t state) { - return state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->prepare ? - freeze_ops->prepare() : 0; + return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ? + s2idle_ops->prepare() : 0; } static int platform_suspend_prepare_noirq(suspend_state_t state) @@ -268,8 +268,8 @@ static void platform_resume_noirq(suspend_state_t state) static void platform_resume_early(suspend_state_t state) { - if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->restore) - freeze_ops->restore(); + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore) + s2idle_ops->restore(); } static void platform_resume_finish(suspend_state_t state) @@ -280,8 +280,8 @@ static void platform_resume_finish(suspend_state_t state) static int platform_suspend_begin(suspend_state_t state) { - if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->begin) - return freeze_ops->begin(); + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin) + return s2idle_ops->begin(); else if (suspend_ops && suspend_ops->begin) return suspend_ops->begin(state); else @@ -290,8 +290,8 @@ static int platform_suspend_begin(suspend_state_t state) static void platform_resume_end(suspend_state_t state) { - if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->end) - freeze_ops->end(); + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end) + s2idle_ops->end(); else if (suspend_ops && suspend_ops->end) suspend_ops->end(); } -- cgit v1.2.3 From 3e48930cc74f0c212ee1838f89ad0ca7fcf2fea1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Aug 2017 05:49:01 -0700 Subject: cgroup: misc changes Misc trivial changes to prepare for future changes. No functional difference. * Expose cgroup_get(), cgroup_tryget() and cgroup_parent(). * Implement task_dfl_cgroup() which dereferences css_set->dfl_cgrp. * Rename cgroup_stats_show() to cgroup_stat_show() for consistency with the file name. 
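cgroup_get() versus cgroup_tryget() is the usual get/tryget split over a reference count; the generic, non-cgroup illustration below (C11 atomics, invented names) only shows that split and is not taken from the patch.

/*
 * Generic refcount illustration: plain get assumes the caller already
 * holds a reference keeping the object alive, tryget only succeeds while
 * the count has not dropped to zero.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcnt;
};

static void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);	/* caller must hold a reference */
}

static bool obj_tryget(struct obj *o)
{
	int old = atomic_load(&o->refcnt);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
			return true;		/* got a reference */
	}
	return false;				/* object already dying */
}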
Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f5ca55db1fe1..c038ccf95b5d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -319,15 +319,6 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static struct cgroup *cgroup_parent(struct cgroup *cgrp) -{ - struct cgroup_subsys_state *parent_css = cgrp->self.parent; - - if (parent_css) - return container_of(parent_css, struct cgroup, self); - return NULL; -} - static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; @@ -534,22 +525,12 @@ out_unlock: return css; } -static void __maybe_unused cgroup_get(struct cgroup *cgrp) -{ - css_get(&cgrp->self); -} - static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); } -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -3306,7 +3287,7 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } -static int cgroup_stats_show(struct seq_file *seq, void *v) +static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; @@ -4423,7 +4404,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.stat", - .seq_show = cgroup_stats_show, + .seq_show = cgroup_stat_show, }, { } /* terminate */ }; -- cgit v1.2.3 From 696b98f244f77a79840bf420861c996d61c82637 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 9 Aug 2017 13:25:21 +0300 Subject: cgroup: remove unneeded checks "descendants" and "depth" are declared as int, so they can't be larger than INT_MAX. Static checkers complain and it's slightly confusing for humans as well so let's just remove these conditions. Signed-off-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c038ccf95b5d..1591e9b20122 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3223,7 +3223,7 @@ static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, return ret; } - if (descendants < 0 || descendants > INT_MAX) + if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); @@ -3266,7 +3266,7 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, return ret; } - if (depth < 0 || depth > INT_MAX) + if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); -- cgit v1.2.3 From 23a9b748a3d27f67cdb078fcb891a920285e75d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 12:08:26 -0700 Subject: sched: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore replaces the spin_unlock_wait() call in do_task_dead() with spin_lock() followed immediately by spin_unlock(). This should be safe from a performance perspective because the lock is this tasks ->pi_lock, and this is called only after the task exits. Signed-off-by: Paul E. 
McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds [ paulmck: Drop smp_mb() based on Peter Zijlstra's analysis: http://lkml.kernel.org/r/20170811144150.26gowhxte7ri5fpk@hirez.programming.kicks-ass.net ] --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..5d22323ae099 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3352,8 +3352,8 @@ void __noreturn do_task_dead(void) * To avoid it, we have to wait for releasing tsk->pi_lock which * is held by try_to_wake_up() */ - smp_mb(); - raw_spin_unlock_wait(¤t->pi_lock); + raw_spin_lock_irq(¤t->pi_lock); + raw_spin_unlock_irq(¤t->pi_lock); /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); -- cgit v1.2.3 From 163616cf2f6ab7a8e37452ec00320039ab65bd45 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 9 Aug 2017 15:32:21 +0900 Subject: genirq: Fix for_each_action_of_desc() macro struct irq_desc does not have a member named "act". The correct name is "action". Currently, all users of this macro use an iterator named "action". If a different name is used, it will cause a build error. Fixes: f944b5a7aff0 ("genirq: Use a common macro to go through the actions list") Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Daniel Lezcano Cc: Jason Cooper Link: http://lkml.kernel.org/r/1502260341-28184-1-git-send-email-yamada.masahiro@socionext.com --- kernel/irq/internals.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a2c48058354c..a4aa39009f0d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -151,7 +151,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) #define for_each_action_of_desc(desc, act) \ - for (act = desc->act; act; act = act->next) + for (act = desc->action; act; act = act->next) struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, -- cgit v1.2.3 From a10b5c564741cd3b6708f085a1fa892b63c2063d Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 14 Aug 2017 16:00:51 +0900 Subject: locking/lockdep: Add a comment about crossrelease_hist_end() in lockdep_sys_exit() In lockdep_sys_exit(), crossrelease_hist_end() is called unconditionally even when getting here without having started e.g. just after forking. But it's no problem since it would roll back to an invalid entry anyway. Add a comment to explain this. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: linux-mm@kvack.org Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502694052-16085-2-git-send-email-byungchul.park@lge.com [ Improved the description and the comments. ] Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1114dc42c27f..257931e2fbbe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4623,6 +4623,10 @@ asmlinkage __visible void lockdep_sys_exit(void) /* * The lock history for each syscall should be independent. 
So wipe the * slate clean on return to userspace. + * + * crossrelease_hist_end() works well here even when getting here + * without starting (i.e. just after forking), because it rolls back + * the index to point to the last entry, which is already invalid. */ crossrelease_hist_end(XHLOCK_PROC); crossrelease_hist_start(XHLOCK_PROC); -- cgit v1.2.3 From 907dc16d7e23ec81a126c9585435494fa1b3a4b7 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 14 Aug 2017 16:00:52 +0900 Subject: locking/lockdep: Fix the rollback and overwrite detection logic in crossrelease As Boqun Feng pointed out, current->hist_id should be aligned with the latest valid xhlock->hist_id so that hist_id_save[] storing current->hist_id can be comparable with xhlock->hist_id. Fix it. Additionally, the condition for overwrite-detection should be the opposite. Fix the code and the comments as well. <- direction to visit hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh (h: history) ^^ ^ || start from here |previous entry current entry Reported-by: Boqun Feng Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: linux-mm@kvack.org Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502694052-16085-3-git-send-email-byungchul.park@lge.com [ Improve the comments some more. ] Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 257931e2fbbe..66011c9f5df3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4853,7 +4853,7 @@ static void add_xhlock(struct held_lock *hlock) /* Initialize hist_lock's members */ xhlock->hlock = *hlock; - xhlock->hist_id = current->hist_id++; + xhlock->hist_id = ++current->hist_id; xhlock->trace.nr_entries = 0; xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; @@ -5029,12 +5029,12 @@ static void commit_xhlocks(struct cross_lock *xlock) break; /* - * Filter out the cases that the ring buffer was - * overwritten and the previous entry has a bigger - * hist_id than the following one, which is impossible - * otherwise. + * Filter out the cases where the ring buffer was + * overwritten and the current entry has a bigger + * hist_id than the previous one, which is impossible + * otherwise: */ - if (unlikely(before(xhlock->hist_id, prev_hist_id))) + if (unlikely(before(prev_hist_id, xhlock->hist_id))) break; prev_hist_id = xhlock->hist_id; -- cgit v1.2.3 From deb4de8b31bc5bf21efb6ac31150a01a631cd647 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Aug 2017 15:00:40 -0700 Subject: seccomp: Provide matching filter for introspection Both the upcoming logging improvements and changes to RET_KILL will need to know which filter a given seccomp return value originated from. In order to delay logic processing of result until after the seccomp loop, this adds a single pointer assignment on matches. This will allow both log and RET_KILL logic to work off the filter rather than doing more expensive tests inside the time-critical run_filters loop. Running tight cycles of getpid() with filters attached shows no measurable difference in speed. 
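The "single pointer assignment" is the familiar pattern of remembering which element of a scan produced the current minimum. A stand-alone sketch is below; the action constants are placeholders for the example, not the real SECCOMP_RET_* values.

/*
 * Sketch of tracking the filter that produced the most restrictive
 * (numerically lowest) action while walking the filter list once.
 */
#include <stddef.h>
#include <stdint.h>

struct filter {
	struct filter *prev;			/* most recently added first */
	uint32_t (*run)(const void *ctx);	/* stand-in for BPF_PROG_RUN() */
};

#define ACTION_MASK	0xffff0000U		/* placeholder action mask */
#define ACTION_ALLOW	0xffff0000U		/* placeholder: highest value */

static uint32_t run_filters(const struct filter *f, const void *ctx,
			    const struct filter **match)
{
	uint32_t ret = ACTION_ALLOW;

	for (; f; f = f->prev) {
		uint32_t cur = f->run(ctx);

		/* keep the lowest-valued action and the filter it came from */
		if ((cur & ACTION_MASK) < (ret & ACTION_MASK)) {
			ret = cur;
			*match = f;	/* stays untouched if everything allowed */
		}
	}
	return ret;
}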
Suggested-by: Tyler Hicks Signed-off-by: Kees Cook Reviewed-by: Tyler Hicks --- kernel/seccomp.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 98b59b5db90b..1f3347fc2605 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -171,10 +171,14 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters + * @match: stores struct seccomp_filter that resulted in the return value, + * unless filter returned SECCOMP_RET_ALLOW, in which case it will + * be unchanged. * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(const struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd, + struct seccomp_filter **match) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -198,8 +202,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) { ret = cur_ret; + *match = f; + } } return ret; } @@ -566,6 +572,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_filter *match = NULL; int data; /* @@ -574,7 +581,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ rmb(); - filter_ret = seccomp_run_filters(sd); + filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION; @@ -638,6 +645,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_ALLOW: + /* + * Note that the "match" filter will always be NULL for + * this action since SECCOMP_RET_ALLOW is the starting + * state in seccomp_run_filters(). + */ return 0; case SECCOMP_RET_KILL: -- cgit v1.2.3 From 8e5f1ad116df6b0de65eac458d5e7c318d1c05af Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:52 +0000 Subject: seccomp: Sysctl to display available actions This patch creates a read-only sysctl containing an ordered list of seccomp actions that the kernel supports. The ordering, from left to right, is the lowest action value (kill) to the highest action value (allow). Currently, a read of the sysctl file would return "kill trap errno trace allow". The contents of this sysctl file can be useful for userspace code as well as the system administrator. The path to the sysctl is: /proc/sys/kernel/seccomp/actions_avail libseccomp and other userspace code can easily determine which actions the current kernel supports. The set of actions supported by the current kernel may be different than the set of action macros found in kernel headers that were installed where the userspace code was built. In addition, this sysctl will allow system administrators to know which actions are supported by the kernel and make it easier to configure exactly what seccomp logs through the audit subsystem. Support for this level of logging configuration will come in a future patch. 
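From user space the new file can be consumed with ordinary string splitting. The helper below is a hypothetical example, not part of the patch, that reports whether a given action name is listed in /proc/sys/kernel/seccomp/actions_avail.

/* Hypothetical probe for the actions_avail sysctl described above. */
#include <stdio.h>
#include <string.h>

static int seccomp_action_avail(const char *name)
{
	char buf[256];
	char *tok;
	FILE *f = fopen("/proc/sys/kernel/seccomp/actions_avail", "r");

	if (!f)
		return -1;			/* sysctl not present: older kernel */
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);

	for (tok = strtok(buf, " \n"); tok; tok = strtok(NULL, " \n"))
		if (!strcmp(tok, name))
			return 1;		/* action name is advertised */

	return 0;				/* kernel does not know this action */
}

int main(void)
{
	printf("trace available: %d\n", seccomp_action_avail("trace"));
	return 0;
}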
Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- kernel/seccomp.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1f3347fc2605..5f19f41e4e50 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -17,11 +17,13 @@ #include #include #include +#include #include #include #include #include #include +#include #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include @@ -934,3 +936,52 @@ out: return ret; } #endif + +#ifdef CONFIG_SYSCTL + +/* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_NAME "kill" +#define SECCOMP_RET_TRAP_NAME "trap" +#define SECCOMP_RET_ERRNO_NAME "errno" +#define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_ALLOW_NAME "allow" + +static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_ALLOW_NAME; + +static struct ctl_path seccomp_sysctl_path[] = { + { .procname = "kernel", }, + { .procname = "seccomp", }, + { } +}; + +static struct ctl_table seccomp_sysctl_table[] = { + { + .procname = "actions_avail", + .data = (void *) &seccomp_actions_avail, + .maxlen = sizeof(seccomp_actions_avail), + .mode = 0444, + .proc_handler = proc_dostring, + }, + { } +}; + +static int __init seccomp_sysctl_init(void) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); + if (!hdr) + pr_warn("seccomp: sysctl registration failed\n"); + else + kmemleak_not_leak(hdr); + + return 0; +} + +device_initcall(seccomp_sysctl_init) + +#endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From d612b1fd8010d0d67b5287fe146b8b55bcbb8655 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:53 +0000 Subject: seccomp: Operation for checking if an action is available Userspace code that needs to check if the kernel supports a given action may not be able to use the /proc/sys/kernel/seccomp/actions_avail sysctl. The process may be running in a sandbox and, therefore, sufficient filesystem access may not be available. This patch adds an operation to the seccomp(2) syscall that allows userspace code to ask the kernel if a given action is available. If the action is supported by the kernel, 0 is returned. If the action is not supported by the kernel, -1 is returned with errno set to -EOPNOTSUPP. If this check is attempted on a kernel that doesn't support this new operation, -1 is returned with errno set to -EINVAL meaning that userspace code will have the ability to differentiate between the two error cases. Signed-off-by: Tyler Hicks Suggested-by: Andy Lutomirski Signed-off-by: Kees Cook --- kernel/seccomp.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f19f41e4e50..7a6089f66fed 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -808,6 +808,27 @@ static inline long seccomp_set_mode_filter(unsigned int flags, } #endif +static long seccomp_get_action_avail(const char __user *uaction) +{ + u32 action; + + if (copy_from_user(&action, uaction, sizeof(action))) + return -EFAULT; + + switch (action) { + case SECCOMP_RET_KILL: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + case SECCOMP_RET_ALLOW: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + /* Common entry point for both prctl and syscall. 
*/ static long do_seccomp(unsigned int op, unsigned int flags, const char __user *uargs) @@ -819,6 +840,11 @@ static long do_seccomp(unsigned int op, unsigned int flags, return seccomp_set_mode_strict(); case SECCOMP_SET_MODE_FILTER: return seccomp_set_mode_filter(flags, uargs); + case SECCOMP_GET_ACTION_AVAIL: + if (flags != 0) + return -EINVAL; + + return seccomp_get_action_avail(uargs); default: return -EINVAL; } -- cgit v1.2.3 From 0ddec0fc8900201c0897b87b762b7c420436662f Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:54 +0000 Subject: seccomp: Sysctl to configure actions that are allowed to be logged Adminstrators can write to this sysctl to set the seccomp actions that are allowed to be logged. Any actions not found in this sysctl will not be logged. For example, all SECCOMP_RET_KILL, SECCOMP_RET_TRAP, and SECCOMP_RET_ERRNO actions would be loggable if "kill trap errno" were written to the sysctl. SECCOMP_RET_TRACE actions would not be logged since its string representation ("trace") wasn't present in the sysctl value. The path to the sysctl is: /proc/sys/kernel/seccomp/actions_logged The actions_avail sysctl can be read to discover the valid action names that can be written to the actions_logged sysctl with the exception of "allow". SECCOMP_RET_ALLOW actions cannot be configured for logging. The default setting for the sysctl is to allow all actions to be logged except SECCOMP_RET_ALLOW. While only SECCOMP_RET_KILL actions are currently logged, an upcoming patch will allow applications to request additional actions to be logged. There's one important exception to this sysctl. If a task is specifically being audited, meaning that an audit context has been allocated for the task, seccomp will log all actions other than SECCOMP_RET_ALLOW despite the value of actions_logged. This exception preserves the existing auditing behavior of tasks with an allocated audit context. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if audit_enabled && task-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- kernel/seccomp.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7a6089f66fed..54357e361aea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -522,6 +522,45 @@ static void seccomp_send_sigsys(int syscall, int reason) } #endif /* CONFIG_SECCOMP_FILTER */ +/* For use with seccomp_actions_logged */ +#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_TRAP (1 << 2) +#define SECCOMP_LOG_ERRNO (1 << 3) +#define SECCOMP_LOG_TRACE (1 << 4) +#define SECCOMP_LOG_ALLOW (1 << 5) + +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + +static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +{ + bool log = false; + + switch (action) { + case SECCOMP_RET_ALLOW: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + break; + case SECCOMP_RET_KILL: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL; + } + + /* + * Force an audit message to be emitted when the action is RET_KILL and + * the action is allowed to be logged by the admin. 
+ */ + if (log) + return __audit_seccomp(syscall, signr, action); + + /* + * Let the audit subsystem decide if the action should be audited based + * on whether the current task itself is being audited. + */ + return audit_seccomp(syscall, signr, action); +} + /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit @@ -547,7 +586,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); do_exit(SIGKILL); } @@ -656,7 +695,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - audit_seccomp(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -673,7 +712,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - audit_seccomp(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action); return -1; } #else @@ -978,6 +1017,127 @@ static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRACE_NAME " " SECCOMP_RET_ALLOW_NAME; +struct seccomp_log_name { + u32 log; + const char *name; +}; + +static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, + { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, + { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, + { } +}; + +static bool seccomp_names_from_actions_logged(char *names, size_t size, + u32 actions_logged) +{ + const struct seccomp_log_name *cur; + bool append_space = false; + + for (cur = seccomp_log_names; cur->name && size; cur++) { + ssize_t ret; + + if (!(actions_logged & cur->log)) + continue; + + if (append_space) { + ret = strscpy(names, " ", size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } else + append_space = true; + + ret = strscpy(names, cur->name, size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } + + return true; +} + +static bool seccomp_action_logged_from_name(u32 *action_logged, + const char *name) +{ + const struct seccomp_log_name *cur; + + for (cur = seccomp_log_names; cur->name; cur++) { + if (!strcmp(cur->name, name)) { + *action_logged = cur->log; + return true; + } + } + + return false; +} + +static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) +{ + char *name; + + *actions_logged = 0; + while ((name = strsep(&names, " ")) && *name) { + u32 action_logged = 0; + + if (!seccomp_action_logged_from_name(&action_logged, name)) + return false; + + *actions_logged |= action_logged; + } + + return true; +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(seccomp_actions_avail)]; + struct ctl_table table; + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + memset(names, 0, sizeof(names)); + + if (!write) { + if (!seccomp_names_from_actions_logged(names, sizeof(names), + seccomp_actions_logged)) + return -EINVAL; + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) { + u32 
actions_logged; + + if (!seccomp_actions_logged_from_names(&actions_logged, + table.data)) + return -EINVAL; + + if (actions_logged & SECCOMP_LOG_ALLOW) + return -EINVAL; + + seccomp_actions_logged = actions_logged; + } + + return 0; +} + static struct ctl_path seccomp_sysctl_path[] = { { .procname = "kernel", }, { .procname = "seccomp", }, @@ -992,6 +1152,11 @@ static struct ctl_table seccomp_sysctl_table[] = { .mode = 0444, .proc_handler = proc_dostring, }, + { + .procname = "actions_logged", + .mode = 0644, + .proc_handler = seccomp_actions_logged_handler, + }, { } }; -- cgit v1.2.3 From e66a39977985b1e69e17c4042cb290768eca9b02 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:56 +0000 Subject: seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW Add a new filter flag, SECCOMP_FILTER_FLAG_LOG, that enables logging for all actions except for SECCOMP_RET_ALLOW for the given filter. SECCOMP_RET_KILL actions are always logged, when "kill" is in the actions_logged sysctl, and SECCOMP_RET_ALLOW actions are never logged, regardless of this flag. This flag can be used to create noisy filters that result in all non-allowed actions to be logged. A process may have one noisy filter, which is loaded with this flag, as well as a quiet filter that's not loaded with this flag. This allows for the actions in a set of filters to be selectively conveyed to the admin. Since a system could have a large number of allocated seccomp_filter structs, struct packing was taken in consideration. On 64 bit x86, the new log member takes up one byte of an existing four byte hole in the struct. On 32 bit x86, the new log member creates a new four byte hole (unavoidable) and consumes one of those bytes. Unfortunately, the tests added for SECCOMP_FILTER_FLAG_LOG are not capable of inspecting the audit log to verify that the actions taken in the filter were logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- kernel/seccomp.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54357e361aea..ed9fde418fc4 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,7 @@ * get/put helpers should be used when accessing an instance * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. + * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @@ -59,6 +60,7 @@ */ struct seccomp_filter { refcount_t usage; + bool log; struct seccomp_filter *prev; struct bpf_prog *prog; }; @@ -452,6 +454,10 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Set log flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_LOG) + filter->log = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference. 
@@ -532,15 +538,22 @@ static void seccomp_send_sigsys(int syscall, int reason) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; -static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +static inline void seccomp_log(unsigned long syscall, long signr, u32 action, + bool requested) { bool log = false; switch (action) { case SECCOMP_RET_ALLOW: + break; case SECCOMP_RET_TRAP: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; + break; case SECCOMP_RET_ERRNO: + log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; + break; case SECCOMP_RET_TRACE: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; case SECCOMP_RET_KILL: default: @@ -548,8 +561,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action) } /* - * Force an audit message to be emitted when the action is RET_KILL and - * the action is allowed to be logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL or + * the FILTER_FLAG_LOG bit was set and the action is allowed to be + * logged by the admin. */ if (log) return __audit_seccomp(syscall, signr, action); @@ -586,7 +600,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); do_exit(SIGKILL); } @@ -695,7 +709,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - seccomp_log(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -712,7 +726,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - seccomp_log(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action, match ? match->log : false); return -1; } #else -- cgit v1.2.3 From 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:57 +0000 Subject: seccomp: Action to log before allowing Add a new action, SECCOMP_RET_LOG, that logs a syscall before allowing the syscall. At the implementation level, this action is identical to the existing SECCOMP_RET_ALLOW action. However, it can be very useful when initially developing a seccomp filter for an application. The developer can set the default action to be SECCOMP_RET_LOG, maybe mark any obviously needed syscalls with SECCOMP_RET_ALLOW, and then put the application through its paces. A list of syscalls that triggered the default action (SECCOMP_RET_LOG) can be easily gleaned from the logs and that list can be used to build the syscall whitelist. Finally, the developer can change the default action to the desired value. This provides a more friendly experience than seeing the application get killed, then updating the filter and rebuilding the app, seeing the application get killed due to a different syscall, then updating the filter and rebuilding the app, etc. The functionality is similar to what's supported by the various LSMs. SELinux has permissive mode, AppArmor has complain mode, SMACK has bring-up mode, etc. SECCOMP_RET_LOG is given a lower value than SECCOMP_RET_ALLOW as allow while logging is slightly more restrictive than quietly allowing. 
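To make that workflow concrete, here is a hedged userspace sketch, not part of this patch, of a filter whose default action is SECCOMP_RET_LOG with a couple of obviously needed syscalls allowed outright. It assumes uapi headers new enough to define SECCOMP_RET_LOG, and a production filter would also validate seccomp_data->arch:

        #include <stddef.h>
        #include <stdio.h>
        #include <sys/prctl.h>
        #include <sys/syscall.h>
        #include <linux/filter.h>
        #include <linux/seccomp.h>

        int main(void)
        {
                struct sock_filter insns[] = {
                        /* Load the syscall number; a real filter checks arch too. */
                        BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                                 offsetof(struct seccomp_data, nr)),
                        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 2, 0),
                        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 1, 0),
                        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_LOG),     /* default */
                        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
                };
                struct sock_fprog prog = {
                        .len = sizeof(insns) / sizeof(insns[0]),
                        .filter = insns,
                };

                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                        return 1;
                if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
                        return 1;

                /* Unlisted syscalls are now logged and allowed, not killed. */
                printf("filter installed\n");
                return 0;
        }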
Unfortunately, the tests added for SECCOMP_RET_LOG are not capable of inspecting the audit log to verify that the syscall was logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if action == RET_LOG && RET_LOG in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- kernel/seccomp.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ed9fde418fc4..59cde2ed3b92 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -533,10 +533,12 @@ static void seccomp_send_sigsys(int syscall, int reason) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) -#define SECCOMP_LOG_ALLOW (1 << 5) +#define SECCOMP_LOG_LOG (1 << 5) +#define SECCOMP_LOG_ALLOW (1 << 6) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | + SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, bool requested) @@ -555,15 +557,18 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; + case SECCOMP_RET_LOG: + log = seccomp_actions_logged & SECCOMP_LOG_LOG; + break; case SECCOMP_RET_KILL: default: log = seccomp_actions_logged & SECCOMP_LOG_KILL; } /* - * Force an audit message to be emitted when the action is RET_KILL or - * the FILTER_FLAG_LOG bit was set and the action is allowed to be - * logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL, + * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is + * allowed to be logged by the admin. 
*/ if (log) return __audit_seccomp(syscall, signr, action); @@ -699,6 +704,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; + case SECCOMP_RET_LOG: + seccomp_log(this_syscall, 0, action, true); + return 0; + case SECCOMP_RET_ALLOW: /* * Note that the "match" filter will always be NULL for @@ -873,6 +882,7 @@ static long seccomp_get_action_avail(const char __user *uaction) case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: + case SECCOMP_RET_LOG: case SECCOMP_RET_ALLOW: break; default: @@ -1023,12 +1033,14 @@ out: #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { @@ -1041,6 +1053,7 @@ static const struct seccomp_log_name seccomp_log_names[] = { { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, { } }; -- cgit v1.2.3 From fd76875ca289a3d4722f266fd2d5532a27083903 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 12:53:18 -0700 Subject: seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD In preparation for adding SECCOMP_RET_KILL_PROCESS, rename SECCOMP_RET_KILL to the more accurate SECCOMP_RET_KILL_THREAD. The existing selftest values are intentionally left as SECCOMP_RET_KILL just to be sure we're exercising the alias. Signed-off-by: Kees Cook --- kernel/seccomp.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 59cde2ed3b92..95ac54cff00f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open. 
*/ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL; + return SECCOMP_RET_KILL_THREAD; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,15 +529,17 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 0) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | + SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | + SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, @@ -560,13 +562,13 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: - log = seccomp_actions_logged & SECCOMP_LOG_KILL; + log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; } /* - * Force an audit message to be emitted when the action is RET_KILL, + * Force an audit message to be emitted when the action is RET_KILL_*, * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is * allowed to be logged by the admin. */ @@ -605,7 +607,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -716,7 +718,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ return 0; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. 
*/ @@ -878,7 +880,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: @@ -1029,19 +1031,20 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ -#define SECCOMP_RET_KILL_NAME "kill" +#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" #define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" -static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " - SECCOMP_RET_TRAP_NAME " " - SECCOMP_RET_ERRNO_NAME " " - SECCOMP_RET_TRACE_NAME " " - SECCOMP_RET_LOG_NAME " " - SECCOMP_RET_ALLOW_NAME; +static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_THREAD_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " + SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { u32 log; @@ -1049,7 +1052,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { - { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, -- cgit v1.2.3 From 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:01:39 -0700 Subject: seccomp: Introduce SECCOMP_RET_KILL_PROCESS This introduces the BPF return value for SECCOMP_RET_KILL_PROCESS to kill an entire process. This cannot yet be reached by seccomp, but it changes the default-kill behavior (for unknown return values) from kill-thread to kill-process. Signed-off-by: Kees Cook --- kernel/seccomp.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 95ac54cff00f..5c7299b9d953 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open. 
*/ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL_THREAD; + return SECCOMP_RET_KILL_PROCESS; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,14 +529,16 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL_THREAD (1 << 0) +#define SECCOMP_LOG_KILL_PROCESS (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 1) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | + SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | @@ -563,8 +565,11 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; case SECCOMP_RET_KILL_THREAD: - default: log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; + break; + case SECCOMP_RET_KILL_PROCESS: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; } /* @@ -719,10 +724,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL_THREAD: + case SECCOMP_RET_KILL_PROCESS: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (get_nr_threads(current) == 1) { + if (action == SECCOMP_RET_KILL_PROCESS || + get_nr_threads(current) == 1) { siginfo_t info; /* Show the original registers in the dump. */ @@ -731,7 +738,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - do_exit(SIGSYS); + if (action == SECCOMP_RET_KILL_PROCESS) + do_group_exit(SIGSYS); + else + do_exit(SIGSYS); } unreachable(); -- cgit v1.2.3 From 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:12:11 -0700 Subject: seccomp: Implement SECCOMP_RET_KILL_PROCESS action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, SECCOMP_RET_KILL_THREAD (neé SECCOMP_RET_KILL) kills the current thread. There have been a few requests for this to kill the entire process (the thread group). This cannot be just changed (discovered when adding coredump support since coredumping kills the entire process) because there are userspace programs depending on the thread-kill behavior. Instead, implement SECCOMP_RET_KILL_PROCESS, which is 0x80000000, and can be processed as "-1" by the kernel, below the existing RET_KILL that is ABI-set to "0". For userspace, SECCOMP_RET_ACTION_FULL is added to expand the mask to the signed bit. Old userspace using the SECCOMP_RET_ACTION mask will see SECCOMP_RET_KILL_PROCESS as 0 still, but this would only be visible when examining the siginfo in a core dump from a RET_KILL_*, where it will think it was thread-killed instead of process-killed. Attempts to introduce this behavior via other ways (filter flags, seccomp struct flags, masked RET_DATA bits) all come with weird side-effects and baggage. This change preserves the central behavioral expectations of the seccomp filter engine without putting too great a burden on changes needed in userspace to use the new action. 
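The ordering trick can be demonstrated in isolation. In the sketch below, which is illustrative only, the 0x80000000 value of KILL_PROCESS and the ABI "0" of KILL_THREAD are taken from the text above, while the two mask values mirror the uapi SECCOMP_RET_ACTION/SECCOMP_RET_ACTION_FULL constants and are quoted here as an assumption:

        #include <stdio.h>
        #include <stdint.h>

        #define RET_ACTION       0x7fff0000U    /* legacy mask */
        #define RET_ACTION_FULL  0xffff0000U    /* extended to the sign bit */
        #define RET_KILL_PROCESS 0x80000000U
        #define RET_KILL_THREAD  0x00000000U    /* the ABI-set "0" */

        #define ACTION_ONLY(ret) ((int32_t)((ret) & RET_ACTION_FULL))

        int main(void)
        {
                /* Full mask: KILL_PROCESS sorts below (i.e. beats) KILL_THREAD. */
                printf("%ld < %ld -> %d\n",
                       (long)ACTION_ONLY(RET_KILL_PROCESS),
                       (long)ACTION_ONLY(RET_KILL_THREAD),
                       ACTION_ONLY(RET_KILL_PROCESS) < ACTION_ONLY(RET_KILL_THREAD));

                /* Legacy mask: old userspace sees KILL_PROCESS as 0 (KILL_THREAD). */
                printf("legacy view: %#x\n", RET_KILL_PROCESS & RET_ACTION);
                return 0;
        }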
The new action is discoverable by userspace through either the new actions_avail sysctl or through the SECCOMP_GET_ACTION_AVAIL seccomp operation. If used without checking for availability, old kernels will treat RET_KILL_PROCESS as RET_KILL_THREAD (since the old mask will produce RET_KILL_THREAD). Cc: Paul Moore Cc: Fabricio Voznika Signed-off-by: Kees Cook --- kernel/seccomp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5c7299b9d953..c24579dfa7a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -181,6 +181,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ +#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { @@ -206,7 +207,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) { + if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } @@ -650,7 +651,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; - action = filter_ret & SECCOMP_RET_ACTION; + action = filter_ret & SECCOMP_RET_ACTION_FULL; switch (action) { case SECCOMP_RET_ERRNO: @@ -890,6 +891,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { + case SECCOMP_RET_KILL_PROCESS: case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: @@ -1041,6 +1043,7 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process" #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" @@ -1049,6 +1052,7 @@ out: #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_PROCESS_NAME " " SECCOMP_RET_KILL_THREAD_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " @@ -1062,6 +1066,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, -- cgit v1.2.3 From 077a1cc06f72f95efd077d433993c16191008e47 Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Sat, 29 Jul 2017 20:36:36 -0700 Subject: printk: Clean up do_syslog() error handling The error variable in do_syslog() is preemptively set to the error code before the error condition is checked, and then set to 0 if the error condition is not encountered. This is not necessary, as it is likely simpler to return immediately upon encountering the error condition. A redundant set of the error variable to 0 is also removed. This patch has been build-tested on x86_64, but not tested for functionality. 
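The shape of the cleanup, reduced to a toy example outside the kernel (a sketch only, not the actual do_syslog() code):

        #include <errno.h>

        /* Before: prime 'error', clear it on the happy path, jump to a label. */
        static int check_len_goto(int len)
        {
                int error;

                error = -EINVAL;
                if (len < 0)
                        goto out;
                error = 0;
        out:
                return error;
        }

        /* After: return as soon as the answer is known; no label, no redundant stores. */
        static int check_len_early_return(int len)
        {
                if (len < 0)
                        return -EINVAL;
                return 0;
        }

        int main(void)
        {
                return check_len_goto(-1) == check_len_early_return(-1) ? 0 : 1;
        }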
Link: http://lkml.kernel.org/r/20170730033636.GA935@vostro Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Nikitas Angelinas Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 87f1a8f4e0f9..cfd9ab1b80c5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1435,7 +1435,7 @@ int do_syslog(int type, char __user *buf, int len, int source) error = check_syslog_permissions(type, source); if (error) - goto out; + return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ @@ -1443,20 +1443,16 @@ int do_syslog(int type, char __user *buf, int len, int source) case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ - error = -EINVAL; if (!buf || len < 0) - goto out; - error = 0; + return -EINVAL; if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } + return 0; + if (!access_ok(VERIFY_WRITE, buf, len)) + return -EFAULT; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); if (error) - goto out; + return error; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ @@ -1465,16 +1461,12 @@ int do_syslog(int type, char __user *buf, int len, int source) /* FALL THRU */ /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: - error = -EINVAL; if (!buf || len < 0) - goto out; - error = 0; + return -EINVAL; if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } + return 0; + if (!access_ok(VERIFY_WRITE, buf, len)) + return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ @@ -1496,15 +1488,13 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: - error = -EINVAL; if (len < 1 || len > 8) - goto out; + return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; - error = 0; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: @@ -1526,7 +1516,6 @@ int do_syslog(int type, char __user *buf, int len, int source) u64 seq = syslog_seq; u32 idx = syslog_idx; - error = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); @@ -1546,7 +1535,7 @@ int do_syslog(int type, char __user *buf, int len, int source) error = -EINVAL; break; } -out: + return error; } -- cgit v1.2.3 From 48ac3c18cc62d4a23d5dc5c59f8720589d0de14b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 14 Jul 2017 12:23:09 +0100 Subject: fork: allow arch-override of VMAP stack alignment In some cases, an architecture might wish its stacks to be aligned to a boundary larger than THREAD_SIZE. For example, using an alignment of double THREAD_SIZE can allow for stack overflows smaller than THREAD_SIZE to be detected by checking a single bit of the stack pointer. This patch allows architectures to override the alignment of VMAP'd stacks, by defining THREAD_ALIGN. Where not defined, this defaults to THREAD_SIZE, as is the case today. 
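To see why doubling the alignment buys a cheap overflow check, consider the toy sketch below; the sizes are made up and the single-bit test is a simplification of what an architecture's entry code would actually do:

        #include <stdio.h>
        #include <stdint.h>
        #include <stdbool.h>

        #define THREAD_SIZE  0x4000UL               /* 16K, purely illustrative */
        #define THREAD_ALIGN (2 * THREAD_SIZE)      /* the override this patch enables */

        /*
         * With a THREAD_SIZE stack placed at a THREAD_ALIGN (2 * THREAD_SIZE)
         * aligned base, every in-bounds stack pointer has the THREAD_SIZE bit
         * clear, while an overflow by less than THREAD_SIZE sets it, so a
         * single bit test suffices.
         */
        static bool sp_overflowed(uintptr_t sp)
        {
                return sp & THREAD_SIZE;
        }

        int main(void)
        {
                uintptr_t base = 10 * THREAD_ALIGN; /* 2 * THREAD_SIZE aligned */

                printf("in bounds:  %d\n", sp_overflowed(base + THREAD_SIZE - 256));
                printf("overflowed: %d\n", sp_overflowed(base - 256));
                return 0;
        }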
Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: linux-kernel@vger.kernel.org --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..f12882a2323b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -217,7 +218,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, -- cgit v1.2.3 From d76036ab47eafa6ce52b69482e91ca3ba337d6d6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:36 +0200 Subject: audit: Fix use after free in audit_remove_watch_rule() audit_remove_watch_rule() drops watch's reference to parent but then continues to work with it. That is not safe as parent can get freed once we drop our reference. The following is a trivial reproducer: mount -o loop image /mnt touch /mnt/file auditctl -w /mnt/file -p wax umount /mnt auditctl -D Grab our own reference in audit_remove_watch_rule() earlier to make sure mark does not get freed under us. CC: stable@vger.kernel.org Reported-by: Tony Jones Signed-off-by: Jan Kara Tested-by: Tony Jones Signed-off-by: Paul Moore --- kernel/audit_watch.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e0656bd63036..1c7ded42f82f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. + */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } -- cgit v1.2.3 From b5fed474b98332559f2590c6bc90388a0899e793 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:37 +0200 Subject: audit: Receive unmount event Although audit_watch_handle_event() can handle FS_UNMOUNT event, it is not part of AUDIT_FS_WATCH mask and thus such event never gets to audit_watch_handle_event(). Thus fsnotify marks are deleted by fsnotify subsystem on unmount without audit being notified about that which leads to a strange state of existing audit rules with dead fsnotify marks. Add FS_UNMOUNT to the mask of events to be received so that audit can clean up its state accordingly. Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c7ded42f82f..d1b5857b7e33 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. 
*/ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { -- cgit v1.2.3 From dc503a8ad98474ea0073a1c5c4d9f18cb8dd0dbf Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 15 Aug 2017 20:34:35 +0100 Subject: bpf/verifier: track liveness for pruning State of a register doesn't matter if it wasn't read in reaching an exit; a write screens off all reads downstream of it from all explored_states upstream of it. This allows us to prune many more branches; here are some processed insn counts for some Cilium programs: Program before after bpf_lb_opt_-DLB_L3.o 6515 3361 bpf_lb_opt_-DLB_L4.o 8976 5176 bpf_lb_opt_-DUNKNOWN.o 2960 1137 bpf_lxc_opt_-DDROP_ALL.o 95412 48537 bpf_lxc_opt_-DUNKNOWN.o 141706 78718 bpf_netdev.o 24251 17995 bpf_overlay.o 10999 9385 The runtime is also improved; here are 'time' results in ms: Program before after bpf_lb_opt_-DLB_L3.o 24 6 bpf_lb_opt_-DLB_L4.o 26 11 bpf_lb_opt_-DUNKNOWN.o 11 2 bpf_lxc_opt_-DDROP_ALL.o 1288 139 bpf_lxc_opt_-DUNKNOWN.o 1768 234 bpf_netdev.o 62 31 bpf_overlay.o 15 13 Signed-off-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 189 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 146 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ecc590e01a1d..7dd96d064be1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -629,8 +629,10 @@ static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) + for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(regs, i); + regs[i].live = REG_LIVE_NONE; + } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; @@ -647,9 +649,26 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, +static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) +{ + struct bpf_verifier_state *parent = state->parent; + + while (parent) { + /* if read wasn't screened by an earlier write ... */ + if (state->regs[regno].live & REG_LIVE_WRITTEN) + break; + /* ... 
then we depend on parent's value */ + parent->regs[regno].live |= REG_LIVE_READ; + state = parent; + parent = state->parent; + } +} + +static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { + struct bpf_reg_state *regs = env->cur_state.regs; + if (regno >= MAX_BPF_REG) { verbose("R%d is invalid\n", regno); return -EINVAL; @@ -661,12 +680,14 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, verbose("R%d !read_ok\n", regno); return -EACCES; } + mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose("frame pointer is read only\n"); return -EACCES; } + regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) mark_reg_unknown(regs, regno); } @@ -695,7 +716,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) static int check_stack_write(struct bpf_verifier_state *state, int off, int size, int value_regno) { - int i; + int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ @@ -710,15 +731,14 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, } /* save register state */ - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - state->regs[value_regno]; + state->spilled_regs[spi] = state->regs[value_regno]; + state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; } else { /* regular write of data into stack */ - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct bpf_reg_state) {}; + state->spilled_regs[spi] = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -726,11 +746,26 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, return 0; } +static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) +{ + struct bpf_verifier_state *parent = state->parent; + + while (parent) { + /* if read wasn't screened by an earlier write ... */ + if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + break; + /* ... 
then we depend on parent's value */ + parent->spilled_regs[slot].live |= REG_LIVE_READ; + state = parent; + parent = state->parent; + } +} + static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; - int i; + int i, spi; slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; @@ -746,10 +781,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } } - if (value_regno >= 0) + spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + + if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; + state->regs[value_regno] = state->spilled_regs[spi]; + mark_stack_slot_read(state, spi); + } return 0; } else { for (i = 0; i < size; i++) { @@ -1167,7 +1205,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; int err; if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || @@ -1177,12 +1214,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } /* check src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -1297,10 +1334,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_DONTCARE) return 0; - if (type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); - return -EACCES; - } + err = check_reg_arg(env, regno, SRC_OP); + if (err) + return err; if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { @@ -1639,10 +1675,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) + for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } - /* update return register */ + /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(regs, BPF_REG_0); @@ -2250,7 +2288,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check src operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -2261,7 +2299,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); + err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; @@ -2274,7 +2312,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check src operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } else { @@ -2285,7 +2323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); + err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; @@ -2328,7 +2366,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } /* check 
src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } else { @@ -2339,7 +2377,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -2360,7 +2398,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -2717,7 +2755,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } /* check src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; @@ -2734,7 +2772,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -2851,7 +2889,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - err = check_reg_arg(regs, insn->dst_reg, DST_OP); + err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; @@ -2917,7 +2955,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check whether implicit source operand (register R6) is readable */ - err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + err = check_reg_arg(env, BPF_REG_6, SRC_OP); if (err) return err; @@ -2928,17 +2966,20 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) if (mode == BPF_IND) { /* check explicit source operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) + for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } /* mark destination R0 register as readable, since it contains - * the value fetched from the packet + * the value fetched from the packet. + * Already marked as written above. */ mark_reg_unknown(regs, BPF_REG_0); return 0; @@ -3194,7 +3235,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, bool varlen_map_access, struct idpair *idmap) { - if (memcmp(rold, rcur, sizeof(*rold)) == 0) + if (!(rold->live & REG_LIVE_READ)) + /* explored state didn't use this */ + return true; + + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) return true; if (rold->type == NOT_INIT) @@ -3372,10 +3417,56 @@ out_free: return ret; } +static bool do_propagate_liveness(const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent) +{ + bool touched = false; /* any changes made? */ + int i; + + if (!parent) + return touched; + /* Propagate read liveness of registers... */ + BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); + /* We don't need to worry about FP liveness because it's read-only */ + for (i = 0; i < BPF_REG_FP; i++) { + if (parent->regs[i].live & REG_LIVE_READ) + continue; + if (state->regs[i].live == REG_LIVE_READ) { + parent->regs[i].live |= REG_LIVE_READ; + touched = true; + } + } + /* ... 
and stack slots */ + for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { + if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + continue; + if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + continue; + if (parent->spilled_regs[i].live & REG_LIVE_READ) + continue; + if (state->spilled_regs[i].live == REG_LIVE_READ) { + parent->regs[i].live |= REG_LIVE_READ; + touched = true; + } + } + return touched; +} + +static void propagate_liveness(const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent) +{ + while (do_propagate_liveness(state, parent)) { + /* Something changed, so we need to feed those changes onward */ + state = parent; + parent = state->parent; + } +} + static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; + int i; sl = env->explored_states[insn_idx]; if (!sl) @@ -3385,11 +3476,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) + if (states_equal(env, &sl->state, &env->cur_state)) { /* reached equivalent register/stack state, - * prune the search + * prune the search. + * Registers read by the continuation are read by us. */ + propagate_liveness(&sl->state, &env->cur_state); return 1; + } sl = sl->next; } @@ -3407,6 +3501,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; + /* connect new state to parentage chain */ + env->cur_state.parent = &new_sl->state; + /* clear liveness marks in current state */ + for (i = 0; i < BPF_REG_FP; i++) + env->cur_state.regs[i].live = REG_LIVE_NONE; + for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) + if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) + env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; return 0; } @@ -3430,6 +3532,7 @@ static int do_check(struct bpf_verifier_env *env) bool do_print_state = false; init_reg_state(regs); + state->parent = NULL; insn_idx = 0; env->varlen_map_value_access = false; for (;;) { @@ -3500,11 +3603,11 @@ static int do_check(struct bpf_verifier_env *env) /* check for reserved fields is already done */ /* check src operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; - err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -3554,11 +3657,11 @@ static int do_check(struct bpf_verifier_env *env) } /* check src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -3589,7 +3692,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } /* check src operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -3643,7 +3746,7 @@ static int do_check(struct bpf_verifier_env *env) * of bpf_exit, which means that program wrote * something into it earlier */ - err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + err = check_reg_arg(env, BPF_REG_0, SRC_OP); if (err) return err; -- cgit 
v1.2.3 From 88a5c690b66110ad255380d8f629c629cf6ca559 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Aug 2017 01:45:33 +0200 Subject: bpf: fix bpf_trace_printk on 32 bit archs James reported that on MIPS32 bpf_trace_printk() is currently broken while MIPS64 works fine: bpf_trace_printk() uses conditional operators to attempt to pass different types to __trace_printk() depending on the format operators. This doesn't work as intended on 32-bit architectures where u32 and long are passed differently to u64, since the result of C conditional operators follows the "usual arithmetic conversions" rules, such that the values passed to __trace_printk() will always be u64 [causing issues later in the va_list handling for vscnprintf()]. For example the samples/bpf/tracex5 test printed lines like below on MIPS32, where the fd and buf have come from the u64 fd argument, and the size from the buf argument: [...] 1180.941542: 0x00000001: write(fd=1, buf= (null), size=6258688) Instead of this: [...] 1625.616026: 0x00000001: write(fd=1, buf=009e4000, size=512) One way to get it working is to expand various combinations of argument types into 8 different combinations for 32 bit and 64 bit kernels. Fix tested by James on MIPS32 and MIPS64 as well that it resolves the issue. Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()") Reported-by: James Hogan Tested-by: James Hogan Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37385193a608..dc498b605d5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, fmt_cnt++; } - return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, - mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, - mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); +/* Horrid workaround for getting va_list handling working with different + * argument type combinations generically for 32 and 64 bit archs. + */ +#define __BPF_TP_EMIT() __BPF_ARG3_TP() +#define __BPF_TP(...) \ + __trace_printk(1 /* Fake ip will not be printed. */, \ + fmt, ##__VA_ARGS__) + +#define __BPF_ARG1_TP(...) \ + ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_TP(arg1, ##__VA_ARGS__) \ + : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ + : __BPF_TP((u32)arg1, ##__VA_ARGS__))) + +#define __BPF_ARG2_TP(...) \ + ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ + : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ + : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) + +#define __BPF_ARG3_TP(...) \ + ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ + : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ + ? 
__BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ + : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) + + return __BPF_TP_EMIT(); } static const struct bpf_func_proto bpf_trace_printk_proto = { -- cgit v1.2.3 From b19af510e67e6ca696b8721f45c148119437307c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 14 Aug 2017 16:53:16 +0200 Subject: genirq/irq_sim: Add a simple interrupt simulator framework Implement a simple, irq_work-based framework for simulating interrupts. Currently the API exposes routines for initializing and deinitializing the simulator object, enqueueing the interrupts and retrieving the allocated interrupt numbers based on the offset of the dummy interrupt in the simulator struct. Signed-off-by: Bartosz Golaszewski Reviewed-by: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Jonathan Corbet Cc: Marc Zyngier Cc: Linus Walleij Cc: linux-doc@vger.kernel.org Cc: linux-gpio@vger.kernel.org Cc: Bamvor Jian Zhang Cc: Jonathan Cameron Link: http://lkml.kernel.org/r/20170814145318.6495-2-brgl@bgdev.pl Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 5 +++ kernel/irq/Makefile | 1 + kernel/irq/irq_sim.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 kernel/irq/irq_sim.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 27c4e774071c..1d06af787932 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -63,6 +63,11 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +# Support for simulated interrupts +config IRQ_SIM + bool + select IRQ_WORK + # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index e4aef7351f2b..1970cafe8f2a 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_IRQ_TIMINGS) += timings.o obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o +obj-$(CONFIG_IRQ_SIM) += irq_sim.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c new file mode 100644 index 000000000000..31a2c12a79ae --- /dev/null +++ b/kernel/irq/irq_sim.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2017 Bartosz Golaszewski + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include + +static void irq_sim_irqmask(struct irq_data *data) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + + irq_ctx->enabled = false; +} + +static void irq_sim_irqunmask(struct irq_data *data) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + + irq_ctx->enabled = true; +} + +static struct irq_chip irq_sim_irqchip = { + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, +}; + +static void irq_sim_handle_irq(struct irq_work *work) +{ + struct irq_sim_work_ctx *work_ctx; + + work_ctx = container_of(work, struct irq_sim_work_ctx, work); + handle_simple_irq(irq_to_desc(work_ctx->irq)); +} + +/** + * irq_sim_init - Initialize the interrupt simulator: allocate a range of + * dummy interrupts. + * + * @sim: The interrupt simulator object to initialize. 
+ * @num_irqs: Number of interrupts to allocate + * + * Returns 0 on success and a negative error number on failure. + */ +int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +{ + int i; + + sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); + if (!sim->irqs) + return -ENOMEM; + + sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); + if (sim->irq_base < 0) { + kfree(sim->irqs); + return sim->irq_base; + } + + for (i = 0; i < num_irqs; i++) { + sim->irqs[i].irqnum = sim->irq_base + i; + sim->irqs[i].enabled = false; + irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); + irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); + irq_set_handler(sim->irq_base + i, &handle_simple_irq); + irq_modify_status(sim->irq_base + i, + IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + } + + init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); + sim->irq_count = num_irqs; + + return 0; +} +EXPORT_SYMBOL_GPL(irq_sim_init); + +/** + * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt + * descriptors and allocated memory. + * + * @sim: The interrupt simulator to tear down. + */ +void irq_sim_fini(struct irq_sim *sim) +{ + irq_work_sync(&sim->work_ctx.work); + irq_free_descs(sim->irq_base, sim->irq_count); + kfree(sim->irqs); +} +EXPORT_SYMBOL_GPL(irq_sim_fini); + +/** + * irq_sim_fire - Enqueue an interrupt. + * + * @sim: The interrupt simulator object. + * @offset: Offset of the simulated interrupt which should be fired. + */ +void irq_sim_fire(struct irq_sim *sim, unsigned int offset) +{ + if (sim->irqs[offset].enabled) { + sim->work_ctx.irq = irq_sim_irqnum(sim, offset); + irq_work_queue(&sim->work_ctx.work); + } +} +EXPORT_SYMBOL_GPL(irq_sim_fire); + +/** + * irq_sim_irqnum - Get the allocated number of a dummy interrupt. + * + * @sim: The interrupt simulator object. + * @offset: Offset of the simulated interrupt for which to retrieve + * the number. + */ +int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) +{ + return sim->irqs[offset].irqnum; +} +EXPORT_SYMBOL_GPL(irq_sim_irqnum); -- cgit v1.2.3 From 44e72c7ebf294043cfe276f7328b8c0e6a3e50e9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 14 Aug 2017 16:53:17 +0200 Subject: genirq/irq_sim: Add a devres variant of irq_sim_init() Add a resource managed version of irq_sim_init(). This can be conveniently used in device drivers. Signed-off-by: Bartosz Golaszewski Acked-by: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Jonathan Corbet Cc: Marc Zyngier Cc: Linus Walleij Cc: linux-doc@vger.kernel.org Cc: linux-gpio@vger.kernel.org Cc: Bamvor Jian Zhang Cc: Jonathan Cameron Link: http://lkml.kernel.org/r/20170814145318.6495-3-brgl@bgdev.pl Signed-off-by: Thomas Gleixner --- kernel/irq/irq_sim.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 31a2c12a79ae..24caabf1a0f7 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -10,6 +10,10 @@ #include #include +struct irq_sim_devres { + struct irq_sim *sim; +}; + static void irq_sim_irqmask(struct irq_data *data) { struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); @@ -92,6 +96,45 @@ void irq_sim_fini(struct irq_sim *sim) } EXPORT_SYMBOL_GPL(irq_sim_fini); +static void devm_irq_sim_release(struct device *dev, void *res) +{ + struct irq_sim_devres *this = res; + + irq_sim_fini(this->sim); +} + +/** + * irq_sim_init - Initialize the interrupt simulator for a managed device. 
+ * + * @dev: Device to initialize the simulator object for. + * @sim: The interrupt simulator object to initialize. + * @num_irqs: Number of interrupts to allocate + * + * Returns 0 on success and a negative error number on failure. + */ +int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, + unsigned int num_irqs) +{ + struct irq_sim_devres *dr; + int rv; + + dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rv = irq_sim_init(sim, num_irqs); + if (rv) { + devres_free(dr); + return rv; + } + + dr->sim = sim; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL_GPL(devm_irq_sim_init); + /** * irq_sim_fire - Enqueue an interrupt. * -- cgit v1.2.3 From 9c8783201cb58e9af8ddeb0cc68f37b0a44ca16c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Aug 2017 13:12:02 -0400 Subject: sched/completion: Document that reinit_completion() must be called after complete_all() The complete_all() function modifies the completion's "done" variable to UINT_MAX, and no other caller (wait_for_completion(), etc) will modify it back to zero. That means that any call to complete_all() must have a reinit_completion() before that completion can be used again. Document this fact by the complete_all() function. Also document that completion_done() will always return true if complete_all() is called. Signed-off-by: Steven Rostedt (VMware) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170816131202.195c2f4b@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..2950f446820d 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -47,6 +47,13 @@ EXPORT_SYMBOL(complete); * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. + * + * Since complete_all() sets the completion of @x permanently to done + * to allow multiple waiters to finish, a call to reinit_completion() + * must be used on @x if @x is to be used again. The code must make + * sure that all waiters have woken and finished before reinitializing + * @x. Also note that the function completion_done() can not be used + * to know if there are still waiters after complete_all() has been called. */ void complete_all(struct completion *x) { @@ -297,6 +304,7 @@ EXPORT_SYMBOL(try_wait_for_completion); * Return: 0 if there are waiters (wait_for_completion() in progress) * 1 if there are no waiters. * + * Note, this will always return true if complete_all() was called on @X. */ bool completion_done(struct completion *x) { -- cgit v1.2.3 From a6f6df69c48b86cd84f36c70593eb4968fceb34a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:32:22 -0700 Subject: bpf: export bpf_prog_inc_not_zero bpf_prog_inc_not_zero will be used by upcoming sockmap patches this patch simply exports it so we can pull it in. Signed-off-by: John Fastabend Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fbe09a0cccf4..17e29f596de1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -911,7 +911,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) EXPORT_SYMBOL_GPL(bpf_prog_inc); /* prog_idr_lock should have been held */ -static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; @@ -927,6 +927,7 @@ static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { -- cgit v1.2.3 From 174a79ff9515f400b9a6115643dafd62a635b7e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:32:47 -0700 Subject: bpf: sockmap with sk redirect support Recently we added a new map type called dev map used to forward XDP packets between ports (6093ec2dc313). This patches introduces a similar notion for sockets. A sockmap allows users to add participating sockets to a map. When sockets are added to the map enough context is stored with the map entry to use the entry with a new helper bpf_sk_redirect_map(map, key, flags) This helper (analogous to bpf_redirect_map in XDP) is given the map and an entry in the map. When called from a sockmap program, discussed below, the skb will be sent on the socket using skb_send_sock(). With the above we need a bpf program to call the helper from that will then implement the send logic. The initial site implemented in this series is the recv_sock hook. For this to work we implemented a map attach command to add attributes to a map. In sockmap we add two programs a parse program and a verdict program. The parse program uses strparser to build messages and pass them to the verdict program. The parse programs use the normal strparser semantics. The verdict program is of type SK_SKB. The verdict program returns a verdict SK_DROP, or SK_REDIRECT for now. Additional actions may be added later. When SK_REDIRECT is returned, expected when bpf program uses bpf_sk_redirect_map(), the sockmap logic will consult per cpu variables set by the helper routine and pull the sock entry out of the sock map. This pattern follows the existing redirect logic in cls and xdp programs. This gives the flow, recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock \ -> kfree_skb As an example use case a message based load balancer may use specific logic in the verdict program to select the sock to send on. Sample programs are provided in future patches that hopefully illustrate the user interfaces. Also selftests are in follow-on patches. Signed-off-by: John Fastabend Signed-off-by: David S. 
Miller --- kernel/bpf/Makefile | 2 +- kernel/bpf/sockmap.c | 792 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 51 +++- kernel/bpf/verifier.c | 14 + 4 files changed, 857 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/sockmap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 2f0bcda40e90..aa24287db888 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_NET),y) -obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c new file mode 100644 index 000000000000..792f0addfafa --- /dev/null +++ b/kernel/bpf/sockmap.c @@ -0,0 +1,792 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* A BPF sock_map is used to store sock objects. This is primarly used + * for doing socket redirect with BPF helper routines. + * + * A sock map may have two BPF programs attached to it, a program used + * to parse packets and a program to provide a verdict and redirect + * decision on the packet. If no BPF parse program is provided it is + * assumed that every skb is a "message" (skb->len). Otherwise the + * parse program is attached to strparser and used to build messages + * that may span multiple skbs. The verdict program will either select + * a socket to send/receive the skb on or provide the drop code indicating + * the skb should be dropped. More actions may be added later as needed. + * The default program will drop packets. + * + * For reference this program is similar to devmap used in XDP context + * reviewing these together may be useful. For an example please review + * ./samples/bpf/sockmap/. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_stab { + struct bpf_map map; + struct sock **sock_map; + struct bpf_prog *bpf_parse; + struct bpf_prog *bpf_verdict; + refcount_t refcnt; +}; + +enum smap_psock_state { + SMAP_TX_RUNNING, +}; + +struct smap_psock { + struct rcu_head rcu; + + /* datapath variables */ + struct sk_buff_head rxqueue; + bool strp_enabled; + + /* datapath error path cache across tx work invocations */ + int save_rem; + int save_off; + struct sk_buff *save_skb; + + struct strparser strp; + struct bpf_prog *bpf_parse; + struct bpf_prog *bpf_verdict; + struct bpf_stab *stab; + + /* Back reference used when sock callback trigger sockmap operations */ + int key; + struct sock *sock; + unsigned long state; + + struct work_struct tx_work; + struct work_struct gc_work; + + void (*save_data_ready)(struct sock *sk); + void (*save_write_space)(struct sock *sk); + void (*save_state_change)(struct sock *sk); +}; + +static inline struct smap_psock *smap_psock_sk(const struct sock *sk) +{ + return (struct smap_psock *)rcu_dereference_sk_user_data(sk); +} + +static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); + int rc; + + if (unlikely(!prog)) + return SK_DROP; + + skb_orphan(skb); + skb->sk = psock->sock; + bpf_compute_data_end(skb); + rc = (*prog->bpf_func)(skb, prog->insnsi); + skb->sk = NULL; + + return rc; +} + +static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sock; + int rc; + + /* Because we use per cpu values to feed input from sock redirect + * in BPF program to do_sk_redirect_map() call we need to ensure we + * are not preempted. RCU read lock is not sufficient in this case + * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. + */ + preempt_disable(); + rc = smap_verdict_func(psock, skb); + switch (rc) { + case SK_REDIRECT: + sock = do_sk_redirect_map(); + preempt_enable(); + if (likely(sock)) { + struct smap_psock *peer = smap_psock_sk(sock); + + if (likely(peer && + test_bit(SMAP_TX_RUNNING, &peer->state) && + sk_stream_memory_free(peer->sock))) { + peer->sock->sk_wmem_queued += skb->truesize; + sk_mem_charge(peer->sock, skb->truesize); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } + } + /* Fall through and free skb otherwise */ + case SK_DROP: + default: + preempt_enable(); + kfree_skb(skb); + } +} + +static void smap_report_sk_error(struct smap_psock *psock, int err) +{ + struct sock *sk = psock->sock; + + sk->sk_err = err; + sk->sk_error_report(sk); +} + +static void smap_release_sock(struct sock *sock); + +/* Called with lock_sock(sk) held */ +static void smap_state_change(struct sock *sk) +{ + struct smap_psock *psock; + struct sock *osk; + + rcu_read_lock(); + + /* Allowing transitions into an established syn_recv states allows + * for early binding sockets to a smap object before the connection + * is established. + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + break; + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_LAST_ACK: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + case TCP_LISTEN: + break; + case TCP_CLOSE: + /* Only release if the map entry is in fact the sock in + * question. There is a case where the operator deletes + * the sock from the map, but the TCP sock is closed before + * the psock is detached. Use cmpxchg to verify correct + * sock is removed. 
+ */ + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + break; + osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL); + if (osk == sk) + smap_release_sock(sk); + break; + default: + smap_report_sk_error(psock, EPIPE); + break; + } + rcu_read_unlock(); +} + +static void smap_read_sock_strparser(struct strparser *strp, + struct sk_buff *skb) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = container_of(strp, struct smap_psock, strp); + smap_do_verdict(psock, skb); + rcu_read_unlock(); +} + +/* Called with lock held on socket */ +static void smap_data_ready(struct sock *sk) +{ + struct smap_psock *psock; + + write_lock_bh(&sk->sk_callback_lock); + psock = smap_psock_sk(sk); + if (likely(psock)) + strp_data_ready(&psock->strp); + write_unlock_bh(&sk->sk_callback_lock); +} + +static void smap_tx_work(struct work_struct *w) +{ + struct smap_psock *psock; + struct sk_buff *skb; + int rem, off, n; + + psock = container_of(w, struct smap_psock, tx_work); + + /* lock sock to avoid losing sk_socket at some point during loop */ + lock_sock(psock->sock); + if (psock->save_skb) { + skb = psock->save_skb; + rem = psock->save_rem; + off = psock->save_off; + psock->save_skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->rxqueue))) { + rem = skb->len; + off = 0; +start: + do { + if (likely(psock->sock->sk_socket)) + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + else + n = -EINVAL; + if (n <= 0) { + if (n == -EAGAIN) { + /* Retry when space is available */ + psock->save_skb = skb; + psock->save_rem = rem; + psock->save_off = off; + goto out; + } + /* Hard errors break pipe and stop xmit */ + smap_report_sk_error(psock, n ? -n : EPIPE); + clear_bit(SMAP_TX_RUNNING, &psock->state); + sk_mem_uncharge(psock->sock, skb->truesize); + psock->sock->sk_wmem_queued -= skb->truesize; + kfree_skb(skb); + goto out; + } + rem -= n; + off += n; + } while (rem); + sk_mem_uncharge(psock->sock, skb->truesize); + psock->sock->sk_wmem_queued -= skb->truesize; + kfree_skb(skb); + } +out: + release_sock(psock->sock); +} + +static void smap_write_space(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) + schedule_work(&psock->tx_work); + rcu_read_unlock(); +} + +static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + if (!psock->strp_enabled) + goto out; + sk->sk_data_ready = psock->save_data_ready; + sk->sk_write_space = psock->save_write_space; + sk->sk_state_change = psock->save_state_change; + psock->save_data_ready = NULL; + psock->save_write_space = NULL; + psock->save_state_change = NULL; + strp_stop(&psock->strp); + psock->strp_enabled = false; +out: + write_unlock_bh(&sk->sk_callback_lock); +} + +static void smap_destroy_psock(struct rcu_head *rcu) +{ + struct smap_psock *psock = container_of(rcu, + struct smap_psock, rcu); + + /* Now that a grace period has passed there is no longer + * any reference to this sock in the sockmap so we can + * destroy the psock, strparser, and bpf programs. 
But, + * because we use workqueue sync operations we can not + * do it in rcu context + */ + schedule_work(&psock->gc_work); +} + +static void smap_release_sock(struct sock *sock) +{ + struct smap_psock *psock = smap_psock_sk(sock); + + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); +} + +static int smap_parse_func_strparser(struct strparser *strp, + struct sk_buff *skb) +{ + struct smap_psock *psock; + struct bpf_prog *prog; + int rc; + + rcu_read_lock(); + psock = container_of(strp, struct smap_psock, strp); + prog = READ_ONCE(psock->bpf_parse); + + if (unlikely(!prog)) { + rcu_read_unlock(); + return skb->len; + } + + /* Attach socket for bpf program to use if needed we can do this + * because strparser clones the skb before handing it to a upper + * layer, meaning skb_orphan has been called. We NULL sk on the + * way out to ensure we don't trigger a BUG_ON in skb/sk operations + * later and because we are not charging the memory of this skb to + * any socket yet. + */ + skb->sk = psock->sock; + bpf_compute_data_end(skb); + rc = (*prog->bpf_func)(skb, prog->insnsi); + skb->sk = NULL; + rcu_read_unlock(); + return rc; +} + + +static int smap_read_sock_done(struct strparser *strp, int err) +{ + return err; +} + +static int smap_init_sock(struct smap_psock *psock, + struct sock *sk) +{ + struct strp_callbacks cb; + + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = smap_read_sock_strparser; + cb.parse_msg = smap_parse_func_strparser; + cb.read_sock_done = smap_read_sock_done; + return strp_init(&psock->strp, sk, &cb); +} + +static void smap_init_progs(struct smap_psock *psock, + struct bpf_stab *stab, + struct bpf_prog *verdict, + struct bpf_prog *parse) +{ + struct bpf_prog *orig_parse, *orig_verdict; + + orig_parse = xchg(&psock->bpf_parse, parse); + orig_verdict = xchg(&psock->bpf_verdict, verdict); + + if (orig_verdict) + bpf_prog_put(orig_verdict); + if (orig_parse) + bpf_prog_put(orig_parse); +} + +static void smap_start_sock(struct smap_psock *psock, struct sock *sk) +{ + if (sk->sk_data_ready == smap_data_ready) + return; + psock->save_data_ready = sk->sk_data_ready; + psock->save_write_space = sk->sk_write_space; + psock->save_state_change = sk->sk_state_change; + sk->sk_data_ready = smap_data_ready; + sk->sk_write_space = smap_write_space; + sk->sk_state_change = smap_state_change; + psock->strp_enabled = true; +} + +static void sock_map_remove_complete(struct bpf_stab *stab) +{ + bpf_map_area_free(stab->sock_map); + kfree(stab); +} + +static void smap_gc_work(struct work_struct *w) +{ + struct smap_psock *psock; + + psock = container_of(w, struct smap_psock, gc_work); + + /* no callback lock needed because we already detached sockmap ops */ + if (psock->strp_enabled) + strp_done(&psock->strp); + + cancel_work_sync(&psock->tx_work); + __skb_queue_purge(&psock->rxqueue); + + /* At this point all strparser and xmit work must be complete */ + if (psock->bpf_parse) + bpf_prog_put(psock->bpf_parse); + if (psock->bpf_verdict) + bpf_prog_put(psock->bpf_verdict); + + if (refcount_dec_and_test(&psock->stab->refcnt)) + sock_map_remove_complete(psock->stab); + + sock_put(psock->sock); + kfree(psock); +} + +static struct smap_psock *smap_init_psock(struct sock *sock, + struct bpf_stab *stab) +{ + struct smap_psock *psock; + + psock = kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN); + if (!psock) + return ERR_PTR(-ENOMEM); + + psock->sock = sock; + 
skb_queue_head_init(&psock->rxqueue); + INIT_WORK(&psock->tx_work, smap_tx_work); + INIT_WORK(&psock->gc_work, smap_gc_work); + + rcu_assign_sk_user_data(sock, psock); + sock_hold(sock); + return psock; +} + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + int err = -EINVAL; + u64 cost; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags) + return ERR_PTR(-EINVAL); + + if (attr->value_size > KMALLOC_MAX_SIZE) + return ERR_PTR(-E2BIG); + + stab = kzalloc(sizeof(*stab), GFP_USER); + if (!stab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + stab->map.map_type = attr->map_type; + stab->map.key_size = attr->key_size; + stab->map.value_size = attr->value_size; + stab->map.max_entries = attr->max_entries; + stab->map.map_flags = attr->map_flags; + + /* make sure page count doesn't overflow */ + cost = (u64) stab->map.max_entries * sizeof(struct sock *); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_stab; + + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(stab->map.pages); + if (err) + goto free_stab; + + stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * + sizeof(struct sock *)); + if (!stab->sock_map) + goto free_stab; + + refcount_set(&stab->refcnt, 1); + return &stab->map; +free_stab: + kfree(stab); + return ERR_PTR(err); +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get a socket state event updates, + * and data ready callabacks that reference the psock from sk_user_data + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock *sock; + + sock = xchg(&stab->sock_map[i], NULL); + if (!sock) + continue; + + smap_release_sock(sock); + } + rcu_read_unlock(); + + if (stab->bpf_verdict) + bpf_prog_put(stab->bpf_verdict); + if (stab->bpf_parse) + bpf_prog_put(stab->bpf_parse); + + if (refcount_dec_and_test(&stab->refcnt)) + sock_map_remove_complete(stab); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? 
*(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (i >= stab->map.max_entries) { + *next = 0; + return 0; + } + + if (i == stab->map.max_entries - 1) + return -ENOENT; + + *next = i + 1; + return 0; +} + +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(stab->sock_map[key]); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int k = *(u32 *)key; + struct sock *sock; + + if (k >= map->max_entries) + return -EINVAL; + + sock = xchg(&stab->sock_map[k], NULL); + if (!sock) + return -EINVAL; + + smap_release_sock(sock); + return 0; +} + +/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are + * done inside rcu critical sections. This ensures on updates that the psock + * will not be released via smap_release_sock() until concurrent updates/deletes + * complete. All operations operate on sock_map using cmpxchg and xchg + * operations to ensure we do not get stale references. Any reads into the + * map must be done with READ_ONCE() because of this. + * + * A psock is destroyed via call_rcu and after any worker threads are cancelled + * and syncd so we are certain all references from the update/lookup/delete + * operations as well as references in the data path are no longer in use. + * + * A psock object holds a refcnt on the sockmap it is attached to and this is + * not decremented until after a RCU grace period and garbage collection occurs. + * This ensures the map is not free'd until psocks linked to it are removed. The + * map link is used when the independent sock events trigger map deletion. + * + * Psocks may only participate in one sockmap at a time. Users that try to + * join a single sock to multiple maps will get an error. + * + * Last, but not least, it is possible the socket is closed while running + * an update on an existing psock. This will release the psock, but again + * not until the update has completed due to rcu grace period rules. + */ +static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 flags, u64 map_flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *verdict, *parse; + struct smap_psock *psock = NULL; + struct sock *old_sock, *sock; + u32 i = *(u32 *)key; + bool update = false; + int err = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= stab->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER)) + return -EINVAL; + + verdict = parse = NULL; + sock = READ_ONCE(stab->sock_map[i]); + + if (flags == BPF_EXIST || flags == BPF_ANY) { + if (!sock && flags == BPF_EXIST) { + return -ENOENT; + } else if (sock && sock != skops->sk) { + return -EINVAL; + } else if (sock) { + psock = smap_psock_sk(sock); + if (unlikely(!psock)) + return -EBUSY; + update = true; + } + } else if (sock && BPF_NOEXIST) { + return -EEXIST; + } + + /* reserve BPF programs early so can abort easily on failures */ + if (map_flags & BPF_SOCKMAP_STRPARSER) { + verdict = READ_ONCE(stab->bpf_verdict); + parse = READ_ONCE(stab->bpf_parse); + + if (!verdict || !parse) + return -ENOENT; + + /* bpf prog refcnt may be zero if a concurrent attach operation + * removes the program after the above READ_ONCE() but before + * we increment the refcnt. 
If this is the case abort with an + * error. + */ + verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + if (IS_ERR(verdict)) + return PTR_ERR(verdict); + + parse = bpf_prog_inc_not_zero(stab->bpf_parse); + if (IS_ERR(parse)) { + bpf_prog_put(verdict); + return PTR_ERR(parse); + } + } + + if (!psock) { + sock = skops->sk; + if (rcu_dereference_sk_user_data(sock)) + return -EEXIST; + psock = smap_init_psock(sock, stab); + if (IS_ERR(psock)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(psock); + } + psock->key = i; + psock->stab = stab; + refcount_inc(&stab->refcnt); + set_bit(SMAP_TX_RUNNING, &psock->state); + } + + if (map_flags & BPF_SOCKMAP_STRPARSER) { + write_lock_bh(&sock->sk_callback_lock); + if (psock->strp_enabled) + goto start_done; + err = smap_init_sock(psock, sock); + if (err) + goto out; + smap_init_progs(psock, stab, verdict, parse); + smap_start_sock(psock, sock); +start_done: + write_unlock_bh(&sock->sk_callback_lock); + } else if (update) { + smap_stop_sock(psock, sock); + } + + if (!update) { + old_sock = xchg(&stab->sock_map[i], skops->sk); + if (old_sock) + smap_release_sock(old_sock); + } + + return 0; +out: + write_unlock_bh(&sock->sk_callback_lock); + if (!update) + smap_release_sock(sock); + return err; +} + +static int sock_map_attach_prog(struct bpf_map *map, + struct bpf_prog *parse, + struct bpf_prog *verdict) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *_parse, *_verdict; + + _parse = xchg(&stab->bpf_parse, parse); + _verdict = xchg(&stab->bpf_verdict, verdict); + + if (_parse) + bpf_prog_put(_parse); + if (_verdict) + bpf_prog_put(_verdict); + + return 0; +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int sock_map_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_map_ctx_update_elem(&skops, map, key, + flags, BPF_SOCKMAP_STRPARSER); + fput(socket->file); + return err; +} + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_attach = sock_map_attach_prog, +}; + +BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags); +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 17e29f596de1..d2f2bdf71ffa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1087,7 +1087,50 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 + +static int 
sockmap_get_from_fd(const union bpf_attr *attr, int ptype) +{ + struct bpf_prog *prog1, *prog2; + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!map->ops->map_attach) { + fdput(f); + return -EOPNOTSUPP; + } + + prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog1)) { + fdput(f); + return PTR_ERR(prog1); + } + + prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); + if (IS_ERR(prog2)) { + fdput(f); + bpf_prog_put(prog1); + return PTR_ERR(prog2); + } + + err = map->ops->map_attach(map, prog1, prog2); + if (err) { + fdput(f); + bpf_prog_put(prog1); + bpf_prog_put(prog2); + return PTR_ERR(map); + } + + fdput(f); + return err; +} static int bpf_prog_attach(const union bpf_attr *attr) { @@ -1116,10 +1159,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_SMAP_INGRESS: + ptype = BPF_PROG_TYPE_SK_SKB; + break; default: return -EINVAL; } + if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS) + return sockmap_get_from_fd(attr, ptype); + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7dd96d064be1..a71bc0996572 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1522,6 +1522,12 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; + case BPF_MAP_TYPE_SOCKMAP: + if (func_id != BPF_FUNC_sk_redirect_map && + func_id != BPF_FUNC_sock_map_update && + func_id != BPF_FUNC_map_delete_elem) + goto error; + break; default: break; } @@ -1550,6 +1556,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_DEVMAP) goto error; break; + case BPF_FUNC_sk_redirect_map: + if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + goto error; + break; + case BPF_FUNC_sock_map_update: + if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + goto error; + break; default: break; } -- cgit v1.2.3 From 8a31db5615667956c513d205cfb06885c3ec6d0b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:33:09 -0700 Subject: bpf: add access to sock fields and pkt data from sk_skb programs Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a71bc0996572..958ba84a9995 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -886,6 +886,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_SK_SKB: if (meta) return meta->pkt_access; -- cgit v1.2.3 From cf9d01405925e3f8144c99d7bf7b184449794066 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 23:35:12 -0700 Subject: bpf: devmap: remove unnecessary value size check In the devmap alloc map logic we check to ensure that the sizeof the values are not greater than KMALLOC_MAX_SIZE. But, in the dev map case we ensure the value size is 4bytes earlier in the function because all values should be netdev ifindex values. The second check is harmless but is not needed so remove it. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/devmap.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7192fb67d4de..18a72a8add43 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -83,12 +83,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags) return ERR_PTR(-EINVAL); - /* if value_size is bigger, the user space won't be able to - * access the elements. - */ - if (attr->value_size > KMALLOC_MAX_SIZE) - return ERR_PTR(-E2BIG); - dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From cf56e3b98c5358883c8df5ed8e04661481225a8f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 16 Aug 2017 15:02:12 -0700 Subject: bpf: sockmap state change warning fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit psock will be uninitialized in the default case; we need to do the same psock lookup and check as in the other branch. Fixes compile warning below. kernel/bpf/sockmap.c: In function ‘smap_state_change’: kernel/bpf/sockmap.c:156:21: warning: ‘psock’ may be used uninitialized in this function [-Wmaybe-uninitialized] struct smap_psock *psock; Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: David Miller Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 792f0addfafa..f7e5e6cf124a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -188,6 +188,9 @@ static void smap_state_change(struct sock *sk) smap_release_sock(sk); break; default: + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + break; smap_report_sk_error(psock, EPIPE); break; } -- cgit v1.2.3 From 6bdc9c4c31c81688e19cb186d49be01bbb6a1618 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 16 Aug 2017 15:02:32 -0700 Subject: bpf: sock_map fixes for !CONFIG_BPF_SYSCALL and !STREAM_PARSER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolve issues with !CONFIG_BPF_SYSCALL and !STREAM_PARSER net/core/filter.c: In function ‘do_sk_redirect_map’: net/core/filter.c:1881:3: error: implicit declaration of function ‘__sock_map_lookup_elem’ [-Werror=implicit-function-declaration] sk = __sock_map_lookup_elem(ri->map, ri->ifindex); ^ net/core/filter.c:1881:6: warning: assignment makes pointer from integer without a cast [enabled by default] sk = __sock_map_lookup_elem(ri->map, ri->ifindex); Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Eric Dumazet Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- kernel/bpf/Makefile | 5 ++++- kernel/bpf/core.c | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index aa24287db888..897daa005b23 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,10 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_NET),y) -obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o +obj-$(CONFIG_BPF_SYSCALL) += devmap.o +ifeq ($(CONFIG_STREAM_PARSER),y) +obj-$(CONFIG_BPF_SYSCALL) += sockmap.o +endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c69e7f5bfde7..917cc04a0a94 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1438,6 +1438,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; +const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { -- cgit v1.2.3 From 7a46ec0e2f4850407de5e1d19a44edee6efa58ec Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Aug 2017 09:19:24 -0700 Subject: locking/refcounts, x86/asm: Implement fast refcount overflow protection This implements refcount_t overflow protection on x86 without a noticeable performance impact, though without the fuller checking of REFCOUNT_FULL. This is done by duplicating the existing atomic_t refcount implementation but with normally a single instruction added to detect if the refcount has gone negative (e.g. wrapped past INT_MAX or below zero). When detected, the handler saturates the refcount_t to INT_MIN / 2. With this overflow protection, the erroneous reference release that would follow a wrap back to zero is blocked from happening, avoiding the class of refcount-overflow use-after-free vulnerabilities entirely. Only the overflow case of refcounting can be perfectly protected, since it can be detected and stopped before the reference is freed and left to be abused by an attacker. There isn't a way to block early decrements, and while REFCOUNT_FULL stops increment-from-zero cases (which would be the state _after_ an early decrement and stops potential double-free conditions), this fast implementation does not, since it would require the more expensive cmpxchg loops. Since the overflow case is much more common (e.g. missing a "put" during an error path), this protection provides real-world protection. For example, the two public refcount overflow use-after-free exploits published in 2016 would have been rendered unexploitable: http://perception-point.io/2016/01/14/analysis-and-exploitation-of-a-linux-kernel-vulnerability-cve-2016-0728/ http://cyseclabs.com/page?n=02012016 This implementation does, however, notice an unchecked decrement to zero (i.e. caller used refcount_dec() instead of refcount_dec_and_test() and it resulted in a zero). Decrements under zero are noticed (since they will have resulted in a negative value), though this only indicates that a use-after-free may have already happened. 
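For illustration, here is a minimal sketch of the unchecked-decrement pattern that produces these reports. It is hypothetical driver code, not part of this patch, written against the existing refcount_t API:

#include <linux/refcount.h>
#include <linux/slab.h>

struct foo {
	refcount_t ref;
	/* ... payload ... */
};

static void foo_put(struct foo *f)
{
	/* Correct: observe the 1 -> 0 transition and free exactly once. */
	if (refcount_dec_and_test(&f->ref))
		kfree(f);
}

static void foo_put_unchecked(struct foo *f)
{
	/*
	 * Buggy: the drop to zero is never checked, so the last reference
	 * is dropped without freeing the object, and any further put drives
	 * the count negative. The fast implementation reports a
	 * refcount_dec() that reaches or goes below zero instead of staying
	 * silent.
	 */
	refcount_dec(&f->ref);
}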
Such notifications are likely avoidable by an attacker that has already exploited a use-after-free vulnerability, but it's better to have them reported than allow such conditions to remain universally silent. On first overflow detection, the refcount value is reset to INT_MIN / 2 (which serves as a saturation value) and a report and stack trace are produced. When operations detect only negative value results (such as changing an already saturated value), saturation still happens but no notification is performed (since the value was already saturated). On the matter of races, since the entire range beyond INT_MAX but before 0 is negative, every operation at INT_MIN / 2 will trap, leaving no overflow-only race condition. As for performance, this implementation adds a single "js" instruction to the regular execution flow of a copy of the standard atomic_t refcount operations. (The non-"and_test" refcount_dec() function, which is uncommon in regular refcount design patterns, has an additional "jz" instruction to detect reaching exactly zero.) Since this is a forward jump, it is by default the non-predicted path, which will be reinforced by dynamic branch prediction. The result is this protection having virtually no measurable change in performance over standard atomic_t operations. The error path, located in .text.unlikely, saves the refcount location and then uses UD0 to fire a refcount exception handler, which resets the refcount, handles reporting, and returns to regular execution. This keeps the changes to .text size minimal, avoiding return jumps and open-coded calls to the error reporting routine. Example assembly comparison: refcount_inc() before: .text: ffffffff81546149: f0 ff 45 f4 lock incl -0xc(%rbp) refcount_inc() after: .text: ffffffff81546149: f0 ff 45 f4 lock incl -0xc(%rbp) ffffffff8154614d: 0f 88 80 d5 17 00 js ffffffff816c36d3 ... .text.unlikely: ffffffff816c36d3: 48 8d 4d f4 lea -0xc(%rbp),%rcx ffffffff816c36d7: 0f ff (bad) These are the cycle counts comparing a loop of refcount_inc() from 1 to INT_MAX and back down to 0 (via refcount_dec_and_test()), between unprotected refcount_t (atomic_t), fully protected REFCOUNT_FULL (refcount_t-full), and this overflow-protected refcount (refcount_t-fast): 2147483646 refcount_inc()s and 2147483647 refcount_dec_and_test()s: cycles protections atomic_t 82249267387 none refcount_t-fast 82211446892 overflow, untested dec-to-zero refcount_t-full 144814735193 overflow, untested dec-to-zero, inc-from-zero This code is a modified version of the x86 PAX_REFCOUNT atomic_t overflow defense from the last public patch of PaX/grsecurity, based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Thanks to PaX Team for various suggestions for improvement for repurposing this code to be a refcount-only protection. Signed-off-by: Kees Cook Reviewed-by: Josh Poimboeuf Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: David S. Miller Cc: Davidlohr Bueso Cc: Elena Reshetova Cc: Eric Biggers Cc: Eric W. Biederman Cc: Greg KH Cc: Hans Liljestrand Cc: James Bottomley Cc: Jann Horn Cc: Linus Torvalds Cc: Manfred Spraul Cc: Peter Zijlstra Cc: Rik van Riel Cc: Serge E. 
Hallyn Cc: Thomas Gleixner Cc: arozansk@redhat.com Cc: axboe@kernel.dk Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/20170815161924.GA133115@beast Signed-off-by: Ingo Molnar --- kernel/panic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index a58932b41700..bdd18afa19a4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -26,6 +26,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif +#ifdef CONFIG_ARCH_HAS_REFCOUNT +void refcount_error_report(struct pt_regs *regs, const char *err) +{ + WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", + err, (void *)instruction_pointer(regs), + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid()), + from_kuid_munged(&init_user_ns, current_euid())); +} +#endif + core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); -- cgit v1.2.3 From 52fa5bc5cbba089f09bc2c372e3432f3f3e48051 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 17 Aug 2017 17:46:12 +0800 Subject: locking/lockdep: Explicitly initialize wq_barrier::done::map With the new lockdep crossrelease feature, which checks completions usage, a false positive is reported in the workqueue code: > Worker A : acquired of wfc.work -> wait for cpu_hotplug_lock to be released > Task B : acquired of cpu_hotplug_lock -> wait for lock#3 to be released > Task C : acquired of lock#3 -> wait for completion of barr->done > (Task C is in lru_add_drain_all_cpuslocked()) > Worker D : wait for wfc.work to be released -> will complete barr->done Such a dead lock can not happen because Task C's barr->done and Worker D's barr->done can not be the same instance. The reason of this false positive is we initialize all wq_barrier::done at insert_wq_barrier() via init_completion(), which makes them belong to the same lock class, therefore, impossible circles are reported. To fix this, explicitly initialize the lockdep map for wq_barrier::done in insert_wq_barrier(), so that the lock class key of wq_barrier::done is a subkey of the corresponding work_struct, as a result we won't build a dependency between a wq_barrier with a unrelated work, and we can differ wq barriers based on the related works, so the false positive above is avoided. Also define the empty lockdep_init_map_crosslock() for !CROSSRELEASE to make the code simple and away from unnecessary #ifdefs. Reported-by: Ingo Molnar Signed-off-by: Boqun Feng Cc: Byungchul Park Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170817094622.12915-1-boqun.feng@gmail.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e86733a8b344..f128b3becfe1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2476,7 +2476,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, */ INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - init_completion(&barr->done); + + /* + * Explicitly init the crosslock for wq_barrier::done, make its lock + * key a subkey of the corresponding work. 
As a result we won't + * build a dependency between wq_barrier::done and unrelated work. + */ + lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, + "(complete)wq_barr::done", + target->lockdep_map.key, 1); + __init_completion(&barr->done); barr->task = current; /* -- cgit v1.2.3 From 7e42776d5ed1fe9a941ed8876c5d15cd7cf5d89f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:05:00 -0700 Subject: rcu: Drive TASKS_RCU directly off of PREEMPT The actual use of TASKS_RCU is only when PREEMPT, otherwise RCU-sched is used instead. This commit therefore makes synchronize_rcu_tasks() and call_rcu_tasks() available always, but mapped to synchronize_sched() and call_rcu_sched(), respectively, when !PREEMPT. This approach also allows some #ifdefs to be removed from rcutorture. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Reviewed-by: Masami Hiramatsu Acked-by: Ingo Molnar --- kernel/rcu/Kconfig | 3 +-- kernel/rcu/rcutorture.c | 17 +---------------- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index be90c945063f..9210379c0353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -69,8 +69,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - bool - default n + def_bool PREEMPT select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..b284c861a511 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -696,8 +696,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks torture testing. */ @@ -735,24 +733,11 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -#define RCUTORTURE_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUTORTURE_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1749,7 +1734,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, RCUTORTURE_TASKS_OPS + &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) -- cgit v1.2.3 From ccdd29ffffa7246cb359b9408772858a15fc4ea5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:51:48 -0700 Subject: rcu: Create reasonable API for do_exit() TASKS_RCU processing Currently, the exit-time support for TASKS_RCU is open-coded in do_exit(). This commit creates exit_tasks_rcu_start() and exit_tasks_rcu_finish() APIs for do_exit() use. This has the benefit of confining the use of the tasks_rcu_exit_srcu variable to one file, allowing it to become static. Signed-off-by: Paul E. 
McKenney --- kernel/exit.c | 7 ++----- kernel/rcu/update.c | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..d297c525f188 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -764,7 +764,6 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); kcov_task_exit(tsk); @@ -881,9 +880,7 @@ void __noreturn do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); - TASKS_RCU(preempt_disable()); - TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); - TASKS_RCU(preempt_enable()); + exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); @@ -918,7 +915,7 @@ void __noreturn do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); - TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + exit_tasks_rcu_finish(); do_task_dead(); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 00e77c470017..5033b66d2753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); /* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_SRCU(tasks_rcu_exit_srcu); +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void) mutex_unlock(&rcu_tasks_kthread_mutex); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + #endif /* #ifdef CONFIG_TASKS_RCU */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From bedbb648efe178c8e4708ada272536744d0f8118 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 15:49:39 -0700 Subject: rcu: Add TPS() to event-traced strings Strings used in event tracing need to be specially handled, for example, using the TPS() macro. Without the TPS() macro, although output looks fine from within a running kernel, extracting traces from a crash dump produces garbage instead of strings. This commit therefore adds the TPS() macro to some unadorned strings that were passed to event-tracing macros. Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree_plugin.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bb9e6e43130f..14ba496a13cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2073,7 +2073,7 @@ wait_again: /* Wait for callbacks to appear. 
*/ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); @@ -2083,7 +2083,7 @@ wait_again: raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); } /* @@ -2111,7 +2111,7 @@ wait_again: schedule_timeout_interruptible(1); } else { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, - "WokeEmpty"); + TPS("WokeEmpty")); } goto wait_again; } @@ -2155,7 +2155,7 @@ wait_again: static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { @@ -2163,7 +2163,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) return; } WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2198,7 +2198,7 @@ static int rcu_nocb_kthread(void *arg) rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, -- cgit v1.2.3 From 7414fac050d5e0b64554b902f3955eabbebb6cb2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Jun 2017 16:44:19 -0700 Subject: rcu: Move rcu.h to new trivial-function style This commit saves a few lines in kernel/rcu/rcu.h by moving to single-line definitions for trivial functions, instead of the old style where the two curly braces each get their own line. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 128 +++++++++---------------------------------------------- 1 file changed, 20 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 808b8c85f626..e4b43fef89f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -356,22 +356,10 @@ do { \ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ -static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ -{ - return true; -} -static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ -{ - return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} +static inline bool rcu_gp_is_normal(void) { return true; } +static inline bool rcu_gp_is_expedited(void) { return false; } +static inline void rcu_expedite_gp(void) { } +static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. 
*/ @@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = 0; *completed = 0; } -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} +static inline void rcutorture_record_test_transition(void) { } +static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU - -/* - * Return the number of grace periods started. - */ -static inline unsigned long rcu_batches_started(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ - return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods completed. - */ -static inline unsigned long rcu_batches_completed_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) -{ - return 0; -} - -/* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of expedited sched grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ - return 0; -} - -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - +static inline unsigned long rcu_batches_started(void) { return 0; } +static inline unsigned long rcu_batches_started_bh(void) { return 0; } +static inline unsigned long rcu_batches_started_sched(void) { return 0; } +static inline unsigned long rcu_batches_completed(void) { return 0; } +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } +static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } +static inline unsigned long +srcu_batches_completed(struct srcu_struct *sp) { return 0; } +static inline void rcu_force_quiescent_state(void) { } +static inline void rcu_bh_force_quiescent_state(void) { } +static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -- cgit v1.2.3 From c5ebe66ce774126b888617cab658f6556d23365e Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 19 Jun 2017 10:32:23 -0700 Subject: rcu: Add event tracing to ->gp_tasks update at GP start There is currently event tracing to track when a task is preempted within a preemptible RCU read-side critical section, and also when that task subsequently reaches its outermost rcu_read_unlock(), but none indicating when a new grace period starts when that grace period must wait on pre-existing readers that have been been preempted at least once since the beginning of their current RCU read-side critical sections. This commit therefore adds an event trace at grace-period start in the case where there are such readers. Note that only the first reader in the list is traced. Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree_plugin.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 14ba496a13cd..3e3f92e981a1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -636,10 +636,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + struct task_struct *t; + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (rcu_preempt_has_tasks(rnp)) + if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; + t = container_of(rnp->gp_tasks, struct task_struct, + rcu_node_entry); + trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), + rnp->gpnum, t->pid); + } WARN_ON_ONCE(rnp->qsmask); } -- cgit v1.2.3 From d5374226c3e444239e063f005dfb59cae4390db4 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 20 Jun 2017 14:45:47 -0700 Subject: rcu: Use idle versions of swait to make idle-hack clear These RCU waits were set to use interruptible waits to avoid the kthreads contributing to system load average, even though they are not interruptible as they are spawned from a kthread. Use the new TASK_IDLE swaits which makes our goal clear, and removes confusion about these paths possibly being interruptible -- they are not. When the system is idle the RCU grace-period kthread will spend all its time blocked inside the swait_event_interruptible(). If the interruptible() was not used, then this kthread would contribute to the load average. This means that an idle system would have a load average of 2 (or 3 if PREEMPT=y), rather than the load average of 0 that almost fifty years of UNIX has conditioned sysadmins to expect. The same argument applies to swait_event_interruptible_timeout() use. The RCU grace-period kthread spends its time blocked inside this call while waiting for grace periods to complete. In particular, if there was only one busy CPU, but that CPU was frequently invoking call_rcu(), then the RCU grace-period kthread would spend almost all its time blocked inside the swait_event_interruptible_timeout(). This would mean that the load average would be 2 rather than the expected 1 for the single busy CPU. Acked-by: "Eric W. Biederman" Tested-by: Paul E. McKenney Signed-off-by: Luis R. Rodriguez Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..2b13d9679f57 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2067,8 +2067,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for wait_event_interruptible_timeout() wakeup - * at force-quiescent-state time. + * Helper function for swait_event_idle() wakeup at force-quiescent-state + * time. */ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) { @@ -2206,9 +2206,8 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_interruptible(rsp->gp_wq, - READ_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); + swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) @@ -2239,7 +2238,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ -- cgit v1.2.3 From d8db2e86d8ba20ef7eb8ba8627129ade40192838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Jun 2017 13:22:17 -0700 Subject: rcu: Add TPS() protection for _rcu_barrier_trace strings The _rcu_barrier_trace() function is a wrapper for trace_rcu_barrier(), which needs TPS() protection for strings passed through the second argument. However, it has escaped prior TPS()-ification efforts because it _rcu_barrier_trace() does not start with "trace_". This commit therefore adds the needed TPS() protection Signed-off-by: Paul E. 
McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2b13d9679f57..c1442bea1b5c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3568,10 +3568,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("LastCB"), -1, + rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); } } @@ -3583,14 +3584,15 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rsp->barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, + rsp->barrier_sequence); } } @@ -3604,14 +3606,15 @@ static void _rcu_barrier(struct rcu_state *rsp) struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, s); + _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); /* Did someone else do our work for us? */ if (rcu_seq_done(&rsp->barrier_sequence, s)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, + rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -3619,7 +3622,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Mark the start of the barrier operation. 
*/ rcu_seq_start(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3642,10 +3645,10 @@ static void _rcu_barrier(struct rcu_state *rsp) rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { - _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, rsp->barrier_sequence); } else { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); @@ -3653,11 +3656,11 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_barrier_callback, rsp, cpu, 0); } } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - _rcu_barrier_trace(rsp, "OnlineQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { - _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, rsp->barrier_sequence); } } @@ -3674,7 +3677,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Mark the end of the barrier operation. */ - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); rcu_seq_end(&rsp->barrier_sequence); /* Other rcu_barrier() invocations can now safely proceed. */ -- cgit v1.2.3 From 35fe723bda12c25f4ac20a4fb91e345cacf568f7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Jun 2017 17:41:25 -0700 Subject: rcu/tracing: Set disable_rcu_irq_enter on rcu_eqs_exit() Set disable_rcu_irq_enter on not only rcu_eqs_enter_common() but also rcu_eqs_exit(), since rcu_eqs_exit() suffers from the same issue as was fixed for rcu_eqs_enter_common() by commit 03ecd3f48e57 ("rcu/tracing: Add rcu_disabled to denote when rcu_irq_enter() will not work"). Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c1442bea1b5c..2b37f1a8e235 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -955,8 +955,10 @@ static void rcu_eqs_exit(bool user) if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { + __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_eqs_exit_common(oldval, user); + __this_cpu_dec(disable_rcu_irq_enter); } } -- cgit v1.2.3 From 2dee9404fa8c4384453a5f3a15ad74ab9480f2d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Jul 2017 21:52:31 -0700 Subject: rcu: Add assertions verifying blocked-tasks list This commit adds assertions verifying the consistency of the rcu_node structure's ->blkd_tasks list and its ->gp_tasks, ->exp_tasks, and ->boost_tasks pointers. In particular, the ->blkd_tasks lists must be empty except for leaf rcu_node structures. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree_plugin.h | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2b37f1a8e235..ac2617d857a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2410,6 +2410,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e3f92e981a1..eadf8b95b5e9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) struct task_struct *t = current; lockdep_assert_held(&rnp->lock); + WARN_ON_ONCE(rdp->mynode != rnp); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); /* * Decide where to queue the newly blocked task. In theory, @@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != + !(rnp->qsmask & rdp->grpmask)); + WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != + !(rnp->expmask & rdp->grpmask)); raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* @@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; } /* -- cgit v1.2.3 From 3a60799269daff5ed254a9b473a8db6f0f5c6bd9 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 12 Jul 2017 07:59:54 -0700 Subject: rcu: Make rcu_idle_enter() rely on callers disabling irqs All callers to rcu_idle_enter() have irqs disabled, so there is no point in rcu_idle_enter disabling them again. This commit therefore replaces the irq disabling with a RCU_LOCKDEP_WARN(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac2617d857a3..76f88b65961f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -843,11 +843,8 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - unsigned long flags; - - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); rcu_eqs_enter(false); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); -- cgit v1.2.3 From d4db30af51eb94ccfd0037e114c1c12cc299a3f8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 12 Jul 2017 09:03:35 -0700 Subject: rcu: Add warning to rcu_idle_enter() for irqs enabled All current callers of rcu_idle_enter() have irqs disabled, and rcu_idle_enter() relies on this, but doesn't check. This commit therefore adds a RCU_LOCKDEP_WARN() to add some verification to the trust. While we are there, pass "true" rather than "1" to rcu_eqs_enter(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 76f88b65961f..5f4eccac1701 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -859,7 +859,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - rcu_eqs_enter(1); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ -- cgit v1.2.3 From 16c0b106070f4760f7d1fffb0cd40393552b5294 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jul 2017 11:55:21 -0700 Subject: rcu: Remove exports from rcu_idle_exit() and rcu_idle_enter() The rcu_idle_exit() and rcu_idle_enter() functions are exported because they were originally used by RCU_NONIDLE(), which was intended to be usable from modules. However, RCU_NONIDLE() now instead uses rcu_irq_enter_irqson() and rcu_irq_exit_irqson(), which are not exported, and there have been no complaints. This commit therefore removes the exports from rcu_idle_exit() and rcu_idle_enter(). Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f4eccac1701..7938754faca8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -846,7 +846,6 @@ void rcu_idle_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); rcu_eqs_enter(false); } -EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -979,7 +978,6 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** -- cgit v1.2.3 From 22e4ebb975822833b083533035233d128b30e98f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 28 Jul 2017 16:40:40 -0400 Subject: membarrier: Provide expedited private command Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built from all runqueues for which current thread's mm is the same as the thread calling sys_membarrier. It executes faster than the non-expedited variant (no blocking). It also works on NOHZ_FULL configurations. Scheduler-wise, it requires a memory barrier before and after context switching between processes (which have different mm). The memory barrier before context switch is already present. For the barrier after context switch: * Our TSO archs can do RELEASE without being a full barrier. Look at x86 spin_unlock() being a regular STORE for example. But for those archs, all atomics imply smp_mb and all of them have atomic ops in switch_mm() for mm_cpumask(), and on x86 the CR3 load acts as a full barrier. * From all weakly ordered machines, only ARM64 and PPC can do RELEASE, the rest does indeed do smp_mb(), so there the spin_unlock() is a full barrier and we're good. * ARM64 has a very heavy barrier in switch_to(), which suffices. * PPC just removed its barrier from switch_to(), but appears to be talking about adding something to switch_mm(). 
So add a smp_mb__after_unlock_lock() for now, until this is settled on the PPC side. Changes since v3: - Properly document the memory barriers provided by each architecture. Changes since v2: - Address comments from Peter Zijlstra, - Add smp_mb__after_unlock_lock() after finish_lock_switch() in finish_task_switch() to add the memory barrier we need after storing to rq->curr. This is much simpler than the previous approach relying on atomic_dec_and_test() in mmdrop(), which actually added a memory barrier in the common case of switching between userspace processes. - Return -EINVAL when MEMBARRIER_CMD_SHARED is used on a nohz_full kernel, rather than having the whole membarrier system call returning -ENOSYS. Indeed, CMD_PRIVATE_EXPEDITED is compatible with nohz_full. Adapt the CMD_QUERY mask accordingly. Changes since v1: - move membarrier code under kernel/sched/ because it uses the scheduler runqueue, - only add the barrier when we switch from a kernel thread. The case where we switch from a user-space thread is already handled by the atomic_dec_and_test() in mmdrop(). - add a comment to mmdrop() documenting the requirement on the implicit memory barrier. CC: Peter Zijlstra CC: Paul E. McKenney CC: Boqun Feng CC: Andrew Hunter CC: Maged Michael CC: gromer@google.com CC: Avi Kivity CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Michael Ellerman Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Tested-by: Dave Watson --- kernel/Makefile | 1 - kernel/membarrier.c | 70 --------------------- kernel/sched/Makefile | 1 + kernel/sched/core.c | 25 ++++++++ kernel/sched/membarrier.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 178 insertions(+), 71 deletions(-) delete mode 100644 kernel/membarrier.c create mode 100644 kernel/sched/membarrier.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4cb8e8b23c6e..9c323a6daa46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/membarrier.c b/kernel/membarrier.c deleted file mode 100644 index 9f9284f37f8d..000000000000 --- a/kernel/membarrier.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010, 2015 Mathieu Desnoyers - * - * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include - -/* - * Bitmask made from a "or" of all commands within enum membarrier_cmd, - * except MEMBARRIER_CMD_QUERY. - */ -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) - -/** - * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. - * - * If this system call is not implemented, -ENOSYS is returned. 
If the - * command specified does not exist, or if the command argument is invalid, - * this system call returns -EINVAL. For a given command, with flags argument - * set to 0, this system call is guaranteed to always return the same value - * until reboot. - * - * All memory accesses performed in program order from each targeted thread - * is guaranteed to be ordered with respect to sys_membarrier(). If we use - * the semantic "barrier()" to represent a compiler barrier forcing memory - * accesses to be performed in program order across the barrier, and - * smp_mb() to represent explicit memory barriers forcing full memory - * ordering across the barrier, we have the following ordering table for - * each pair of barrier(), sys_membarrier() and smp_mb(): - * - * The pair ordering is detailed as (O: ordered, X: not ordered): - * - * barrier() smp_mb() sys_membarrier() - * barrier() X X O - * smp_mb() X O O - * sys_membarrier() O O O - */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) -{ - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ - if (tick_nohz_full_enabled()) - return -ENOSYS; - if (unlikely(flags)) - return -EINVAL; - switch (cmd) { - case MEMBARRIER_CMD_QUERY: - return MEMBARRIER_CMD_BITMASK; - case MEMBARRIER_CMD_SHARED: - if (num_online_cpus() > 1) - synchronize_sched(); - return 0; - default: - return -EINVAL; - } -} diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfee6ea7db49..f77269c6c2f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2640,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * + * TODO: This smp_mb__after_unlock_lock can go away if PPC end + * up adding a full barrier to switch_mm(), or we should figure + * out if a smp_mb__after_unlock_lock is really the proper API + * to use. + */ + smp_mb__after_unlock_lock(); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -3329,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. For TSO + * (e.g. x86), the architecture must provide its own + * barrier in switch_mm(). For weakly ordered machines + * for which spin_unlock() acts as a full memory + * barrier, finish_lock_switch() in common code takes + * care of this barrier. For weakly ordered machines for + * which spin_unlock() acts as a RELEASE barrier (only + * arm64 and PowerPC), arm64 has a full barrier in + * switch_to(), and PowerPC has + * smp_mb__after_unlock_lock() before + * finish_lock_switch(). 
+ */ ++*switch_count; trace_sched_switch(preempt, prev, next); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +#include "sched.h" /* for cpu_rq(). */ + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ + smp_mb(); /* IPIs should be serializing but paranoid. */ +} + +static void membarrier_private_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm == current->mm) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). 
If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + { + int cmd_mask = MEMBARRIER_CMD_BITMASK; + + if (tick_nohz_full_enabled()) + cmd_mask &= ~MEMBARRIER_CMD_SHARED; + return cmd_mask; + } + case MEMBARRIER_CMD_SHARED: + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -EINVAL; + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + membarrier_private_expedited(); + return 0; + default: + return -EINVAL; + } +} -- cgit v1.2.3 From dec13c42d21a96adc5d0d25510d0b59775dfd2ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 12:47:04 -0700 Subject: completion: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore replaces the spin_unlock_wait() call in completion_done() with spin_lock() followed immediately by spin_unlock(). This should be safe from a performance perspective because the lock will be held only the wakeup happens really quickly. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds Reviewed-by: Steven Rostedt (VMware) --- kernel/sched/completion.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..c9524d2d9316 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -300,6 +300,8 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; + if (!READ_ONCE(x->done)) return false; @@ -307,14 +309,9 @@ bool completion_done(struct completion *x) * If ->done, we need to wait for complete() to release ->wait.lock * otherwise we can end up freeing the completion before complete() * is done referencing it. - * - * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders - * the loads of ->done and ->wait.lock such that we cannot observe - * the lock before complete() acquires it while observing the ->done - * after it's acquired the lock. */ - smp_rmb(); - spin_unlock_wait(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); -- cgit v1.2.3 From 8083f29349372d5b949dc022ae9a981edc89ac41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 12:55:21 -0700 Subject: exit: Replace spin_unlock_wait() with lock/unlock pair There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. 
This commit therefore replaces the spin_unlock_wait() call in do_exit() with spin_lock() followed immediately by spin_unlock(). This should be safe from a performance perspective because the lock is a per-task lock, and this is happening only at task-exit time. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..abfbcf66e5c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -819,7 +819,8 @@ void __noreturn do_exit(long code) * Ensure that we must observe the pi_state in exit_mm() -> * mm_release() -> exit_pi_state_list(). */ - raw_spin_unlock_wait(&tsk->pi_lock); + raw_spin_lock_irq(&tsk->pi_lock); + raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", -- cgit v1.2.3 From d3a024abbc438277851c510b51ec9b158821488b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 15:47:44 -0700 Subject: locking: Remove spin_unlock_wait() generic definitions There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore removes spin_unlock_wait() and related definitions from core code. Signed-off-by: Paul E. McKenney Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds --- kernel/locking/qspinlock.c | 117 --------------------------------------------- 1 file changed, 117 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..294294c71ba4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. - * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. - * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - * CPU0 CPU1 - * - * global_lock(); local_lock(i) - * spin_lock(&G) spin_lock(&L[i]) - * for (i) if (!spin_is_locked(&G)) { - * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); - * return; - * } - * // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. - * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. 
- * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - * CPU0 CPU1 - * - * flag = set; - * smp_mb(); spin_lock(&l) - * spin_unlock_wait(&l); if (!flag) - * // add to lockless list - * spin_unlock(&l); - * // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. - * - * There are two paths that can issue the unordered store: - * - * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 - * - * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 - * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. - */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ - u32 val; - - for (;;) { - val = atomic_read(&lock->val); - - if (!val) /* not locked, we're done */ - goto done; - - if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ - break; - - /* not locked, but pending, wait until we observe the lock */ - cpu_relax(); - } - - /* any unlock is good */ - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); - -done: - smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif - #endif /* _GEN_PV_LOCK_SLOWPATH */ /** -- cgit v1.2.3 From 1ab2de2bfed3ab2073ed1e7afa2b1134930d2b70 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Aug 2017 14:59:40 +0200 Subject: bpf: fix liveness propagation to parent in spilled stack slots Using parent->regs[] when propagating REG_LIVE_READ for spilled regs doesn't work since parent->regs[] denote the set of normal registers but not spilled ones. Propagate to the correct regs. Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning") Reported-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 958ba84a9995..40f669ddb571 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3460,7 +3460,7 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, if (parent->spilled_regs[i].live & REG_LIVE_READ) continue; if (state->spilled_regs[i].live == REG_LIVE_READ) { - parent->regs[i].live |= REG_LIVE_READ; + parent->spilled_regs[i].live |= REG_LIVE_READ; touched = true; } } -- cgit v1.2.3 From 976d28bfd1f62a3f8e5370c5e7127ff5b3499359 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Aug 2017 17:22:36 +0200 Subject: bpf: don't enable preemption twice in smap_do_verdict In smap_do_verdict(), the fall-through branch leads to call preempt_enable() twice for the SK_REDIRECT, which creates an imbalance. Only enable it for all remaining cases again. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f7e5e6cf124a..39de541fbcdc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -135,7 +135,8 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) /* Fall through and free skb otherwise */ case SK_DROP: default: - preempt_enable(); + if (rc != SK_REDIRECT) + preempt_enable(); kfree_skb(skb); } } -- cgit v1.2.3 From a529bea8fa6b6dded6179c72d3385e0f7d0a4fde Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Wed, 28 Jun 2017 22:21:35 +0900 Subject: timekeeping: Use proper timekeeper for debug code When CONFIG_DEBUG_TIMEKEEPING is enabled the timekeeping_check_update() function will update status like last_warning and underflow_seen on the timekeeper. If there are issues found this state is used to rate limit the warnings that get printed. This rate limiting doesn't really really work if stored in real_tk as the shadow timekeeper is overwritten onto real_tk at the end of every update_wall_time() call, resetting last_warning and other statuses. Fix rate limiting by using the shadow_timekeeper for timekeeping_check_update(). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Fixes: commit 57d05a93ada7 ("time: Rework debugging variables so they aren't global") Signed-off-by: Stafford Horne Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cedafa008de5..8f5866981883 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2066,7 +2066,7 @@ void update_wall_time(void) goto out; /* Do some additional sanity checking */ - timekeeping_check_update(real_tk, offset); + timekeeping_check_update(tk, offset); /* * With NO_HZ we may have to accumulate many cycle_intervals -- cgit v1.2.3 From 47b4a457e4cc816b3fdd2ee55c65fda8ea6de051 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Jul 2017 14:08:35 +0200 Subject: alarmtimer: Fix unavailable wake-up source in sysfs Currently the alarmtimer registers a wake-up source unconditionally, regardless of the system having a (wake-up capable) RTC or not. 
Hence the alarmtimer will always show up in /sys/kernel/debug/wakeup_sources, even if it is not available, and thus cannot be a wake-up source. To fix this, postpone registration until a wake-up capable RTC device is added. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Geert Uytterhoeven Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0b8ff7d257ea..73a2b476e59f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -56,9 +56,9 @@ static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); #endif +#ifdef CONFIG_RTC_CLASS static struct wakeup_source *ws; -#ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; @@ -89,6 +89,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); + struct wakeup_source *__ws; if (rtcdev) return -EBUSY; @@ -98,13 +99,20 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!device_may_wakeup(rtc->dev.parent)) return -1; + __ws = wakeup_source_register("alarmtimer"); + spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); + ws = __ws; + __ws = NULL; } spin_unlock_irqrestore(&rtcdev_lock, flags); + + wakeup_source_unregister(__ws); + return 0; } @@ -860,7 +868,6 @@ static int __init alarmtimer_init(void) error = PTR_ERR(pdev); goto out_drv; } - ws = wakeup_source_register("alarmtimer"); return 0; out_drv: -- cgit v1.2.3 From c71b02e4d207cbcf097f9746d5f7967b22905e70 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 9 Aug 2017 21:11:00 -0700 Subject: Revert "pstore: Honor dmesg_restrict sysctl on dmesg dumps" This reverts commit 68c4a4f8abc60c9440ede9cd123d48b78325f7a3, with various conflict clean-ups. The capability check required too much privilege compared to simple DAC controls. A system builder was forced to have crash handler processes run with CAP_SYSLOG which would give it the ability to read (and wipe) the _current_ dmesg, which is much more access than being given access only to the historical log stored in pstorefs. With the prior commit to make the root directory 0750, the files are protected by default but a system builder can now opt to give access to a specific group (via chgrp on the pstorefs root directory) without being forced to also give away CAP_SYSLOG. 
Suggested-by: Nick Kralevich Signed-off-by: Kees Cook Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky --- kernel/printk/printk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fc47863f629c..97bda7b0655b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -649,7 +649,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, int source) +static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } -EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { -- cgit v1.2.3 From e2cabe48c20efb174ce0c01190f8b9c5f3ea1d13 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 10 Aug 2017 09:50:55 +0530 Subject: cpufreq: schedutil: Don't restrict kthread to related_cpus unnecessarily Utilization update callbacks are now processed remotely, even on the CPUs that don't share cpufreq policy with the target CPU (if dvfs_possible_from_any_cpu flag is set). But in non-fast switch paths, the frequency is changed only from one of policy->related_cpus. This happens because the kthread which does the actual update is bound to a subset of CPUs (i.e. related_cpus). Allow frequency to be remotely updated as well (i.e. call __cpufreq_driver_target()) if dvfs_possible_from_any_cpu flag is set. Reported-by: Pavan Kondeti Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2ba04bb3182a..69571ee6a175 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -487,7 +487,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + + /* Kthread is bound to all CPUs by default */ + if (!policy->dvfs_possible_from_any_cpu) + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); -- cgit v1.2.3 From c49cbc19b31e069cb344921c7286d7549767d10e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 14 Aug 2017 14:50:16 +0530 Subject: cpufreq: schedutil: Always process remote callback with slow switching The frequency update from the utilization update handlers can be divided into two parts: (A) Finding the next frequency (B) Updating the frequency While any CPU can do (A), (B) can be restricted to a group of CPUs only, depending on the current platform. For platforms where fast cpufreq switching is possible, both (A) and (B) are always done from the same CPU and that CPU should be capable of changing the frequency of the target CPU. But for platforms where fast cpufreq switching isn't possible, after doing (A) we wake up a kthread which will eventually do (B). This kthread is already bound to the right set of CPUs, i.e. only those which can change the frequency of CPUs of a cpufreq policy. And so any CPU can actually do (A) in this case, as the frequency is updated from the right set of CPUs only. Check cpufreq_can_do_remote_dvfs() only for the fast switching case. 
Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 69571ee6a175..a07f17a5f38f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -84,13 +84,18 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * * However, drivers cannot in general deal with cross-cpu * requests, so while get_next_freq() will work, our - * sugov_update_commit() call may not. + * sugov_update_commit() call may not for the fast switching platforms. * * Hence stop here for remote requests if they aren't supported * by the hardware, as calculating the frequency is pointless if * we cannot in fact act on it. + * + * For the slow switching platforms, the kthread is always scheduled on + * the right set of CPUs and any CPU can find the next frequency and + * schedule the kthread. */ - if (!cpufreq_can_do_remote_dvfs(sg_policy->policy)) + if (sg_policy->policy->fast_switch_enabled && + !cpufreq_can_do_remote_dvfs(sg_policy->policy)) return false; if (sg_policy->work_in_progress) -- cgit v1.2.3 From 726fb6b4f2a82a14a906f39bdabac4863b87c01a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 15 Aug 2017 18:16:59 -0700 Subject: ACPI / PM: Check low power idle constraints for debug only For SoC to achieve its lowest power platform idle state a set of hardware preconditions must be met. These preconditions or constraints can be obtained by issuing a device specific method (_DSM) with function "1". Refer to the document provided in the link below. Here during initialization (from attach() callback of LPS0 device), invoke function 1 to get the device constraints. Each enabled constraint is stored in a table. The devices in this table are used to check whether they were in required minimum state, while entering suspend. This check is done from platform freeze wake() callback, only when /sys/power/pm_debug_messages attribute is non zero. If any constraint is not met and device is ACPI power managed then it prints the device information to kernel logs. Also if debug is enabled in acpi/sleep.c, the constraint table and state of each device on wake is dumped in kernel logs. Since pm_debug_messages_on setting is used as condition to check constraints outside kernel/power/main.c, pm_debug_messages_on is changed to a global variable. Link: http://www.uefi.org/sites/default/files/resources/Intel_ACPI_Low_Power_S0_Idle.pdf Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 3074ea4cec0a..3a2ca9066583 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -361,7 +361,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, power_attr_ro(pm_wakeup_irq); -static bool pm_debug_messages_on __read_mostly; +bool pm_debug_messages_on __read_mostly; static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -- cgit v1.2.3 From 536e2e34bd002267384b0668ffff3f023003a830 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 09:11:56 +0100 Subject: genirq/debugfs: Triggering of interrupts from userspace When developing new (and therefore buggy) interrupt related code, it can sometimes be useful to inject interrupts without having to rely on a device to actually generate them. This functionnality relies either on the irqchip driver to expose a irq_set_irqchip_state(IRQCHIP_STATE_PENDING) callback, or on the core code to be able to retrigger a (edge-only) interrupt. To use this feature: echo -n trigger > /sys/kernel/debug/irq/irqs/IRQNUM WARNING: This is DANGEROUS, and strictly a debug feature. Do not use it on a production system. Your HW is likely to catch fire, your data to be corrupted, and reporting this will make you look an even bigger fool than the idiot who wrote this patch. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170818081156.9264-1-marc.zyngier@arm.com --- kernel/irq/debugfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4d384edc0c64..c3fdb36dec30 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -5,6 +5,7 @@ */ #include #include +#include #include "internals.h" @@ -171,8 +172,55 @@ static int irq_debug_open(struct inode *inode, struct file *file) return single_open(file, irq_debug_show, inode->i_private); } +static ssize_t irq_debug_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct irq_desc *desc = file_inode(file)->i_private; + char buf[8] = { 0, }; + size_t size; + + size = min(sizeof(buf) - 1, count); + if (copy_from_user(buf, user_buf, size)) + return -EFAULT; + + if (!strncmp(buf, "trigger", size)) { + unsigned long flags; + int err; + + /* Try the HW interface first */ + err = irq_set_irqchip_state(irq_desc_get_irq(desc), + IRQCHIP_STATE_PENDING, true); + if (!err) + return count; + + /* + * Otherwise, try to inject via the resend interface, + * which may or may not succeed. + */ + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, flags); + + if (irq_settings_is_level(desc)) { + /* Can't do level, sorry */ + err = -EINVAL; + } else { + desc->istate |= IRQS_PENDING; + check_irq_resend(desc); + err = 0; + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); + + return err ? 
err : count; + } + + return count; +} + static const struct file_operations dfs_irq_ops = { .open = irq_debug_open, + .write = irq_debug_write, .read = seq_read, .llseek = seq_lseek, .release = single_release, @@ -186,7 +234,7 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) return; sprintf(name, "%d", irq); - desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc, + desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc, &dfs_irq_ops); } -- cgit v1.2.3 From 6bc6d4abd22e890cf69a05554fa8f8f83f351515 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 09:39:15 +0100 Subject: genirq/proc: Use the accessor to report the effective affinity If CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK is defined, but the interrupt is not single target, the effective affinity reported in /proc/irq/x/effective_affinity will be empty, which is not the truth. Instead, use the accessor to report the affinity, which will pick the right mask. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Andrew Lunn Cc: James Hogan Cc: Jason Cooper Cc: Paul Burton Cc: Chris Zankel Cc: Kevin Cernekee Cc: Wei Xu Cc: Max Filippov Cc: Florian Fainelli Cc: Gregory Clement Cc: Matt Redfearn Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20170818083925.10108-3-marc.zyngier@arm.com --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7f9642a1e267..0534781724d0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -61,7 +61,7 @@ static int show_irq_affinity(int type, struct seq_file *m) case EFFECTIVE: case EFFECTIVE_LIST: #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - mask = desc->irq_common_data.effective_affinity; + mask = irq_data_get_effective_affinity_mask(&desc->irq_data); break; #else return -EINVAL; -- cgit v1.2.3 From 65efd9a49af8174b2283fd5b27e9edf30e4483d0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Aug 2017 17:53:30 -0700 Subject: genirq: Export more irq_chip_*_parent() functions Many of the family of functions, including irq_chip_mask_parent() and irq_chip_unmask_parent(), are exported, but not all. Add EXPORT_SYMBOL_GPL to irq_chip_enable_parent, irq_chip_disable_parent and irq_chip_set_affinity_parent, so they likewise are usable from modules.
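[Editorial illustration, not part of the patch above: with these additional exports, a stacked irqchip built as a module can delegate straight to its parent domain's chip. The sketch below assumes the usual irq_chip callbacks from <linux/irq.h>; the chip name is hypothetical.]

#include <linux/irq.h>

/* Hypothetical modular irqchip that simply forwards to its parent. */
static struct irq_chip example_stacked_chip = {
	.name			= "example-stacked",
	.irq_enable		= irq_chip_enable_parent,	/* newly exported */
	.irq_disable		= irq_chip_disable_parent,	/* newly exported */
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_set_affinity	= irq_chip_set_affinity_parent,	/* newly exported */
};

[This mirrors what built-in stacked irqchips already do; the exports only make the same pattern available to modular drivers such as the gpio-thunderx driver referenced in the following patch.]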
Signed-off-by: David Daney Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Alexandre Courbot Cc: Marc Zyngier Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/1503017616-3252-2-git-send-email-david.daney@cavium.com --- kernel/irq/chip.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3cc37c0c85e..6514f07acaad 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1105,6 +1105,7 @@ void irq_chip_enable_parent(struct irq_data *data) else data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_enable_parent); /** * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if @@ -1119,6 +1120,7 @@ void irq_chip_disable_parent(struct irq_data *data) else data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_disable_parent); /** * irq_chip_ack_parent - Acknowledge the parent interrupt @@ -1181,6 +1183,7 @@ int irq_chip_set_affinity_parent(struct irq_data *data, return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_affinity_parent); /** * irq_chip_set_type_parent - Set IRQ type on the parent interrupt -- cgit v1.2.3 From 7703b08cc93b3586f9eb733f3a2b10bed634a5cf Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Aug 2017 17:53:31 -0700 Subject: genirq: Add handle_fasteoi_{level,edge}_irq flow handlers Follow-on patch for gpio-thunderx uses a irqdomain hierarchy which requires slightly different flow handlers, add them to chip.c which contains most of the other flow handlers. Make these conditionally compiled based on CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS. Signed-off-by: David Daney Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Alexandre Courbot Cc: Marc Zyngier Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/1503017616-3252-3-git-send-email-david.daney@cavium.com --- kernel/irq/Kconfig | 4 ++ kernel/irq/chip.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1d06af787932..a117adf7084b 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -73,6 +73,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Support for hierarchical fasteoi+edge and fasteoi+level handlers +config IRQ_FASTEOI_HIERARCHY_HANDLERS + bool + # Generic IRQ IPI support config GENERIC_IRQ_IPI bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6514f07acaad..23958980189d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1092,6 +1092,112 @@ void irq_cpu_offline(void) } #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + +#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS +/** + * handle_fasteoi_ack_irq - irq handler for edge hierarchy + * stacked on transparent controllers + * + * @desc: the interrupt description structure for this irq + * + * Like handle_fasteoi_irq(), but for use with hierarchy where + * the irq_chip also needs to have its ->irq_ack() function + * called. 
+ */ +void handle_fasteoi_ack_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + raw_spin_lock(&desc->lock); + + if (!irq_may_run(desc)) + goto out; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + /* + * If its disabled or no action available + * then mask it and get out of here: + */ + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + mask_irq(desc); + goto out; + } + + kstat_incr_irqs_this_cpu(desc); + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + + /* Start handling the irq */ + desc->irq_data.chip->irq_ack(&desc->irq_data); + + preflow_handler(desc); + handle_irq_event(desc); + + cond_unmask_eoi_irq(desc, chip); + + raw_spin_unlock(&desc->lock); + return; +out: + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq); + +/** + * handle_fasteoi_mask_irq - irq handler for level hierarchy + * stacked on transparent controllers + * + * @desc: the interrupt description structure for this irq + * + * Like handle_fasteoi_irq(), but for use with hierarchy where + * the irq_chip also needs to have its ->irq_mask_ack() function + * called. + */ +void handle_fasteoi_mask_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + raw_spin_lock(&desc->lock); + mask_ack_irq(desc); + + if (!irq_may_run(desc)) + goto out; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + /* + * If its disabled or no action available + * then mask it and get out of here: + */ + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + mask_irq(desc); + goto out; + } + + kstat_incr_irqs_this_cpu(desc); + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + + preflow_handler(desc); + handle_irq_event(desc); + + cond_unmask_eoi_irq(desc, chip); + + raw_spin_unlock(&desc->lock); + return; +out: + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); + +#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit v1.2.3 From b526adfe1b0531fceba44b18c156e4edf9c6205c Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Aug 2017 17:53:32 -0700 Subject: irqdomain: Factor out code to add and remove items to and from the revmap The code to add and remove items to and from the revmap occurs several times. In preparation for the follow on patches that add more uses of this code, factor this out in to separate static functions. 
Signed-off-by: David Daney Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Mark Rutland Cc: Alexandre Courbot Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/1503017616-3252-4-git-send-email-david.daney@cavium.com --- kernel/irq/irqdomain.c | 58 +++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f1f251479aa6..2093b88ce9b7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -455,6 +455,31 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); +static void irq_domain_clear_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + } +} + +static void irq_domain_set_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq, + struct irq_data *irq_data) +{ + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = irq_data->irq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + } +} + void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); @@ -483,13 +508,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) domain->mapcount--; /* Clear reverse map for this hwirq */ - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = 0; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - } + irq_domain_clear_mapping(domain, hwirq); } int irq_domain_associate(struct irq_domain *domain, unsigned int virq, @@ -533,13 +552,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, } domain->mapcount++; - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = virq; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - } + irq_domain_set_mapping(domain, hwirq, irq_data); mutex_unlock(&irq_domain_mutex); irq_clear_status_flags(virq, IRQ_NOREQUEST); @@ -1138,16 +1151,9 @@ static void irq_domain_insert_irq(int virq) for (data = irq_get_irq_data(virq); data; data = data->parent_data) { struct irq_domain *domain = data->domain; - irq_hw_number_t hwirq = data->hwirq; domain->mapcount++; - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = virq; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_tree, hwirq, data); - mutex_unlock(&revmap_trees_mutex); - } + irq_domain_set_mapping(domain, data->hwirq, data); /* If not already assigned, give the domain the chip's name */ if (!domain->name && data->chip) @@ -1171,13 +1177,7 @@ static void irq_domain_remove_irq(int virq) irq_hw_number_t hwirq = data->hwirq; domain->mapcount--; - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = 0; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - } + irq_domain_clear_mapping(domain, hwirq); } } -- cgit v1.2.3 From 0d12ec075a18f53e6f58ec95a4f534da2641bf9b Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Aug 2017 17:53:33 
-0700 Subject: irqdomain: Check for NULL function pointer in irq_domain_free_irqs_hierarchy() A follow-on patch will call irq_domain_free_irqs_hierarchy() when the free() function pointer may be NULL. Add a NULL pointer check to handle this new use case. Signed-off-by: David Daney Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Mark Rutland Cc: Alexandre Courbot Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/1503017616-3252-5-git-send-email-david.daney@cavium.com --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 2093b88ce9b7..24fda7557cef 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1362,7 +1362,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { - domain->ops->free(domain, irq_base, nr_irqs); + if (domain->ops->free) + domain->ops->free(domain, irq_base, nr_irqs); } int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, -- cgit v1.2.3 From 495c38d3001fd226cf91df1d031320f349bcaf35 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Aug 2017 17:53:34 -0700 Subject: irqdomain: Add irq_domain_{push,pop}_irq() functions For an already existing irqdomain hierarchy, as might be obtained via a call to pci_enable_msix_range(), a PCI driver wishing to add an additional irqdomain to the hierarchy needs to be able to insert the irqdomain to that already initialized hierarchy. Calling irq_domain_create_hierarchy() allows the new irqdomain to be created, but no existing code allows for initializing the associated irq_data. Add a couple of helper functions (irq_domain_push_irq() and irq_domain_pop_irq()) to initialize the irq_data for the new irqdomain added to an existing hierarchy. Signed-off-by: David Daney Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Mark Rutland Cc: Alexandre Courbot Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/1503017616-3252-6-git-send-email-david.daney@cavium.com --- kernel/irq/irqdomain.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 24fda7557cef..1ff9912211e9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1449,6 +1449,175 @@ out_free_desc: return ret; } +/* The irq_data was moved, fix the revmap to refer to the new location */ +static void irq_domain_fix_revmap(struct irq_data *d) +{ + void **slot; + + if (d->hwirq < d->domain->revmap_size) + return; /* Not using radix tree. */ + + /* Fix up the revmap. */ + mutex_lock(&revmap_trees_mutex); + slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); + if (slot) + radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); + mutex_unlock(&revmap_trees_mutex); +} + +/** + * irq_domain_push_irq() - Push a domain in to the top of a hierarchy. + * @domain: Domain to push. + * @virq: Irq to push the domain in to. + * @arg: Passed to the irq_domain_ops alloc() function. + * + * For an already existing irqdomain hierarchy, as might be obtained + * via a call to pci_enable_msix(), add an additional domain to the + * head of the processing chain. Must be called before request_irq() + * has been called. 
+ */ +int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) +{ + struct irq_data *child_irq_data; + struct irq_data *root_irq_data = irq_get_irq_data(virq); + struct irq_desc *desc; + int rv = 0; + + /* + * Check that no action has been set, which indicates the virq + * is in a state where this function doesn't have to deal with + * races between interrupt handling and maintaining the + * hierarchy. This will catch gross misuse. Attempting to + * make the check race free would require holding locks across + * calls to struct irq_domain_ops->alloc(), which could lead + * to deadlock, so we just do a simple check before starting. + */ + desc = irq_to_desc(virq); + if (!desc) + return -EINVAL; + if (WARN_ON(desc->action)) + return -EBUSY; + + if (domain == NULL) + return -EINVAL; + + if (WARN_ON(!irq_domain_is_hierarchy(domain))) + return -EINVAL; + + if (domain->parent != root_irq_data->domain) + return -EINVAL; + + if (!root_irq_data) + return -EINVAL; + + child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL, + irq_data_get_node(root_irq_data)); + if (!child_irq_data) + return -ENOMEM; + + mutex_lock(&irq_domain_mutex); + + /* Copy the original irq_data. */ + *child_irq_data = *root_irq_data; + + /* + * Overwrite the root_irq_data, which is embedded in struct + * irq_desc, with values for this domain. + */ + root_irq_data->parent_data = child_irq_data; + root_irq_data->domain = domain; + root_irq_data->mask = 0; + root_irq_data->hwirq = 0; + root_irq_data->chip = NULL; + root_irq_data->chip_data = NULL; + + /* May (probably does) set hwirq, chip, etc. */ + rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); + if (rv) { + /* Restore the original irq_data. */ + *root_irq_data = *child_irq_data; + goto error; + } + + irq_domain_fix_revmap(child_irq_data); + irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data); + +error: + mutex_unlock(&irq_domain_mutex); + + return rv; +} +EXPORT_SYMBOL_GPL(irq_domain_push_irq); + +/** + * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy. + * @domain: Domain to remove. + * @virq: Irq to remove the domain from. + * + * Undo the effects of a call to irq_domain_push_irq(). Must be + * called either before request_irq() or after free_irq(). + */ +int irq_domain_pop_irq(struct irq_domain *domain, int virq) +{ + struct irq_data *root_irq_data = irq_get_irq_data(virq); + struct irq_data *child_irq_data; + struct irq_data *tmp_irq_data; + struct irq_desc *desc; + + /* + * Check that no action is set, which indicates the virq is in + * a state where this function doesn't have to deal with races + * between interrupt handling and maintaining the hierarchy. + * This will catch gross misuse. Attempting to make the check + * race free would require holding locks across calls to + * struct irq_domain_ops->free(), which could lead to + * deadlock, so we just do a simple check before starting. 
+ */ + desc = irq_to_desc(virq); + if (!desc) + return -EINVAL; + if (WARN_ON(desc->action)) + return -EBUSY; + + if (domain == NULL) + return -EINVAL; + + if (!root_irq_data) + return -EINVAL; + + tmp_irq_data = irq_domain_get_irq_data(domain, virq); + + /* We can only "pop" if this domain is at the top of the list */ + if (WARN_ON(root_irq_data != tmp_irq_data)) + return -EINVAL; + + if (WARN_ON(root_irq_data->domain != domain)) + return -EINVAL; + + child_irq_data = root_irq_data->parent_data; + if (WARN_ON(!child_irq_data)) + return -EINVAL; + + mutex_lock(&irq_domain_mutex); + + root_irq_data->parent_data = NULL; + + irq_domain_clear_mapping(domain, root_irq_data->hwirq); + irq_domain_free_irqs_hierarchy(domain, virq, 1); + + /* Restore the original irq_data. */ + *root_irq_data = *child_irq_data; + + irq_domain_fix_revmap(root_irq_data); + + mutex_unlock(&irq_domain_mutex); + + kfree(child_irq_data); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_pop_irq); + /** * irq_domain_free_irqs - Free IRQ number and associated data structures * @virq: base IRQ number -- cgit v1.2.3 From e8f241893dfbbebe2813c01eac54f263e6a5e59c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 10:53:45 +0100 Subject: genirq: Restore trigger settings in irq_modify_status() irq_modify_status starts by clearing the trigger settings from irq_data before applying the new settings, but doesn't restore them, leaving them to IRQ_TYPE_NONE. That's pretty confusing to the potential request_irq() that could follow. Instead, snapshot the settings before clearing them, and restore them if the irq_modify_status() invocation was not changing the trigger. Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Reported-and-tested-by: jeffy Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jon Hunter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170818095345.12378-1-marc.zyngier@arm.com --- kernel/irq/chip.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3cc37c0c85e..3675c6004f2a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags; + unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) @@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); + trigger = irqd_get_trigger_type(&desc->irq_data); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) @@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; + + irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } -- cgit v1.2.3 From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. 
The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumess a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed then the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups. Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0d..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934c..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. 
+ * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. + */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have -- cgit v1.2.3 From 3cf294962df8fcde710eb5e762e0929e2ba49947 Mon Sep 17 00:00:00 2001 From: Krzysztof Opasiak Date: Wed, 5 Jul 2017 19:25:48 +0200 Subject: posix-cpu-timers: Use dedicated helper to access rlimit values Use rlimit() and rlimit_max() helper instead of manually writing whole chain from task to rlimit value Signed-off-by: Krzysztof Opasiak Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170705172548.7911-1-k.opasiak@samsung.com --- kernel/time/posix-cpu-timers.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a3bd5dbe0dc4..8585ad6e472a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -799,7 +799,6 @@ static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { struct list_head *timers = tsk->cpu_timers; - struct signal_struct *const sig = tsk->signal; struct task_cputime *tsk_expires = &tsk->cputime_expires; u64 expires; unsigned long soft; @@ -823,10 +822,9 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. 
*/ - soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + soft = task_rlimit(tsk, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { - unsigned long hard = - READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -847,7 +845,8 @@ static void check_thread_timers(struct task_struct *tsk, */ if (soft < hard) { soft += USEC_PER_SEC; - sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; + tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = + soft; } if (print_fatal_signals) { pr_info("RT Watchdog Timeout (soft): %s[%d]\n", @@ -938,11 +937,10 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); - unsigned long hard = - READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); u64 x; if (psecs >= hard) { /* -- cgit v1.2.3 From e1cba4b85daa71b710384d451ff6238d5e4d1ff6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 17 Aug 2017 15:33:09 -0400 Subject: cgroup: Add mount flag to enable cpuset to use v2 behavior in v1 cgroup A new mount option "cpuset_v2_mode" is added to the v1 cgroupfs filesystem to enable the cpuset controller to use v2 behavior in a v1 cgroup. This mount option applies only to the cpuset controller and has no effect on other controllers. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f0e8601b13cb..024085daab1a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -895,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) + seq_puts(seq, ",cpuset_v2_mode"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) @@ -949,6 +951,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->cpuset_clone_children = true; continue; } + if (!strcmp(token, "cpuset_v2_mode")) { + opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; + continue; + } if (!strcmp(token, "xattr")) { opts->flags |= CGRP_ROOT_XATTR; continue; -- cgit v1.2.3 From b8d1b8ee93df8ffbabbeadd65d39853cfad6d698 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 17 Aug 2017 15:33:10 -0400 Subject: cpuset: Allow v2 behavior in v1 cgroup Cpuset v2 has some useful behaviors that are not present in v1 because of backward compatibility concerns. One of them is the restoration of the original cpu and memory node mask after a hot removal and addition event sequence. This patch makes the cpuset controller check the CGRP_ROOT_CPUSET_V2_MODE flag and use the v2 behavior if it is set.
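[For illustration only, not part of either patch: once CGRP_ROOT_CPUSET_V2_MODE exists, a v1 cpuset hierarchy would opt into the v2 behavior at mount time via the new option. The sketch below calls mount(2) directly; the mount point path is an assumption, and the equivalent shell form would be "mount -t cgroup -o cpuset,cpuset_v2_mode cgroup /sys/fs/cgroup/cpuset".]

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/*
	 * Hypothetical mount point; "cpuset" selects the controller and
	 * "cpuset_v2_mode" is the new flag parsed by parse_cgroupfs_options()
	 * in the patch above.
	 */
	if (mount("cgroup", "/sys/fs/cgroup/cpuset", "cgroup", 0,
		  "cpuset,cpuset_v2_mode")) {
		perror("mount");
		return 1;
	}
	return 0;
}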
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 252d70c9a49b..f3539a41c49d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -298,6 +298,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); +/* + * Cgroup v2 behavior is used when on default hierarchy or the + * cgroup_v2_mode flag is set. + */ +static inline bool is_in_v2_mode(void) +{ + return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); +} + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -488,8 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; /* @@ -895,8 +904,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - cpumask_empty(new_cpus)) + if (is_in_v2_mode() && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -913,7 +921,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1149,8 +1157,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - nodes_empty(*new_mems)) + if (is_in_v2_mode() && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. 
*/ @@ -1167,7 +1174,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1459,7 +1466,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + if (!is_in_v2_mode() && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1977,7 +1984,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2054,7 +2061,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2248,7 +2255,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2279,7 +2286,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + bool on_dfl = is_in_v2_mode(); mutex_lock(&cpuset_mutex); -- cgit v1.2.3 From ae2b27b859a144f503d382580320873c0beb09c7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Aug 2017 10:27:02 +0300 Subject: bpf: fix a return in sockmap_get_from_fd() "map" is a valid pointer. We wanted to return "err" instead. Also let's return a zero literal at the end. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: Dan Carpenter Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d2f2bdf71ffa..b8cb1b3c9bfb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1125,11 +1125,11 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) fdput(f); bpf_prog_put(prog1); bpf_prog_put(prog2); - return PTR_ERR(map); + return err; } fdput(f); - return err; + return 0; } static int bpf_prog_attach(const union bpf_attr *attr) -- cgit v1.2.3 From 2ba293c9e7db150943f06b12d3eb7213e7fae624 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Fri, 18 Aug 2017 15:15:58 -0700 Subject: kmod: fix wait on recursive loop Recursive loops with module loading were previously handled in kmod by restricting the number of modprobe calls to 50 and if that limit was breached request_module() would return an error and a user would see the following on their kernel dmesg: request_module: runaway loop modprobe binfmt-464c Starting init:/sbin/init exists but couldn't execute it (error -8) This issue could happen for instance when a 64-bit kernel boots a 32-bit userspace on some architectures and has no 32-bit binary format hanlders. This is visible, for instance, when a CONFIG_MODULES enabled 64-bit MIPS kernel boots a into o32 root filesystem and the binfmt handler for o32 binaries is not built-in. After commit 6d7964a722af ("kmod: throttle kmod thread limit") we now don't have any visible signs of an error and the kernel just waits for the loop to end somehow. Although this *particular* recursive loop could also be addressed by doing a sanity check on search_binary_handler() and disallowing a modular binfmt to be required for modprobe, a generic solution for any recursive kernel kmod issues is still needed. This should catch these loops. We can investigate each loop and address each one separately as they come in, this however puts a stop gap for them as before. Link: http://lkml.kernel.org/r/20170809234635.13443-3-mcgrof@kernel.org Fixes: 6d7964a722af ("kmod: throttle kmod thread limit") Signed-off-by: Luis R. Rodriguez Reported-by: Matt Redfearn Tested-by: Matt Redfearn Cc: "Eric W. Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: Daniel Mentz Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c8..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -70,6 +70,18 @@ static DECLARE_RWSEM(umhelper_sem); static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); +/* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + /* modprobe_path is set via /proc/sys. */ @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) 
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); -- cgit v1.2.3 From eb61b5911bdc923875cde99eb25203a0e2b06d43 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 18 Aug 2017 15:16:18 -0700 Subject: signal: don't remove SIGNAL_UNKILLABLE for traced tasks. When forcing a signal, SIGNAL_UNKILLABLE is removed to prevent recursive faults, but this is undesirable when tracing. For example, debugging an init process (whether global or namespace), hitting a breakpoint and SIGTRAP will force SIGTRAP and then remove SIGNAL_UNKILLABLE. Everything continues fine, but then once debugging has finished, the init process is left killable which is unlikely what the user expects, resulting in either an accidentally killed init or an init that stops reaping zombies. Link: http://lkml.kernel.org/r/20170815112806.10728-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e6..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.2.3 From 16a4362573782115096799aebd9862f8bb140169 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 17 Aug 2017 18:14:43 -0700 Subject: bpf: Fix map-in-map checking in the verifier In check_map_func_compatibility(), a 'break' has been accidentally removed for the BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS cases. This patch adds it back. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Cc: John Fastabend Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 40f669ddb571..4f6e7eb42ba0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1523,6 +1523,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; + break; case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && -- cgit v1.2.3 From 96eabe7a40aa17e613cf3db2c742ee8b1fc764d0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 18 Aug 2017 11:28:00 -0700 Subject: bpf: Allow selecting numa node during map creation The current map creation API does not allow to provide the numa-node preference. The memory usually comes from where the map-creation-process is running. The performance is not ideal if the bpf_prog is known to always run in a numa node different from the map-creation-process. One of the use case is sharding on CPU to different LRU maps (i.e. an array of LRU maps). Here is the test result of map_perf_test on the INNER_LRU_HASH_PREALLOC test if we force the lru map used by CPU0 to be allocated from a remote numa node: [ The machine has 20 cores. CPU0-9 at node 0. CPU10-19 at node 1 ] ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec 4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec 3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec 6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec 2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec 1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec 7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec 0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec #<<< After specifying numa node: ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec 3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec 1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec 6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec 2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec 4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec 7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec 0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec #<<< This patch adds one field, numa_node, to the bpf_attr. Since numa node 0 is a valid node, a new flag BPF_F_NUMA_NODE is also added. The numa_node field is honored if and only if the BPF_F_NUMA_NODE flag is set. Numa node selection is not supported for percpu map. This patch does not change all the kmalloc. F.e. 'htab = kzalloc()' is not changed since the object is small enough to stay in the cache. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 7 +++++-- kernel/bpf/devmap.c | 9 ++++++--- kernel/bpf/hashtab.c | 19 +++++++++++++++---- kernel/bpf/lpm_trie.c | 9 +++++++-- kernel/bpf/sockmap.c | 10 +++++++--- kernel/bpf/stackmap.c | 8 +++++--- kernel/bpf/syscall.c | 14 ++++++++++---- 7 files changed, 55 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d771a3872500..96e9c5c1dfc9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_array *array; u64 array_size; u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags) + attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) @@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size); + array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); @@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; array->map.map_flags = attr->map_flags; + array->map.numa_node = numa_node; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 18a72a8add43..67f4f00ce33a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -80,7 +80,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags) + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); @@ -93,6 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) dtab->map.value_size = attr->value_size; dtab->map.max_entries = attr->max_entries; dtab->map.map_flags = attr->map_flags; + dtab->map.numa_node = bpf_map_attr_numa_node(attr); err = -ENOMEM; @@ -119,7 +120,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) goto free_dtab; dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *)); + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); if (!dtab->netdev_map) goto free_dtab; @@ -344,7 +346,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + map->numa_node); if (!dev) return -ENOMEM; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..47ae748c3a49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,6 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab_is_percpu(htab) && !htab_is_lru(htab)) num_entries += 
num_possible_cpus(); - htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, + htab->map.numa_node); if (!htab->elems) return -ENOMEM; @@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ return ERR_PTR(-EPERM); - if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); @@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (lru && !prealloc) return ERR_PTR(-ENOTSUPP); + if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; htab->map.map_flags = attr->map_flags; + htab->map.numa_node = numa_node; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * - sizeof(struct bucket)); + sizeof(struct bucket), + htab->map.numa_node); if (!htab->buckets) goto free_htab; @@ -689,7 +699,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!l_new) return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b09185f0f17d..1b767844a76f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, + trie->map.numa_node); if (!node) return NULL; @@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) + static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; @@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || - attr->map_flags != BPF_F_NO_PREALLOC || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->map_flags & ~LPM_CREATE_FLAG_MASK || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || @@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; trie->map.map_flags = attr->map_flags; + trie->map.numa_node = bpf_map_attr_numa_node(attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = 
trie->data_size * 8; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 39de541fbcdc..78b2bb9370ac 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -443,7 +443,9 @@ static struct smap_psock *smap_init_psock(struct sock *sock, { struct smap_psock *psock; - psock = kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN); + psock = kzalloc_node(sizeof(struct smap_psock), + GFP_ATOMIC | __GFP_NOWARN, + stab->map.numa_node); if (!psock) return ERR_PTR(-ENOMEM); @@ -465,7 +467,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags) + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) @@ -481,6 +483,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) stab->map.value_size = attr->value_size; stab->map.max_entries = attr->max_entries; stab->map.map_flags = attr->map_flags; + stab->map.numa_node = bpf_map_attr_numa_node(attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -495,7 +498,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) goto free_stab; stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *)); + sizeof(struct sock *), + stab->map.numa_node); if (!stab->sock_map) goto free_stab; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 31147d730abf..135be433e9a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; int err; - smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries); + smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, + smap->map.numa_node); if (!smap->elems) return -ENOMEM; @@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags) + if (attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); /* check sanity of attributes */ @@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); - smap = bpf_map_area_alloc(cost); + smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); @@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b8cb1b3c9bfb..9378f3ba2cbf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -105,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size) +void *bpf_map_area_alloc(size_t size, int numa_node) { /* We definitely need __GFP_NORETRY, so OOM killer doesn't * trigger under memory pressure as we really just want to @@ -115,12 +115,13 @@ void *bpf_map_area_alloc(size_t size) void *area; if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc(size, GFP_USER | flags); + area = kmalloc_node(size, GFP_USER | flags, numa_node); if (area != NULL) return area; } - 
return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL); + return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, + __builtin_return_address(0)); } void bpf_map_area_free(void *area) @@ -309,10 +310,11 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd +#define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) { + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; int err; @@ -320,6 +322,10 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (numa_node != NUMA_NO_NODE && + (numa_node >= nr_node_ids || !node_online(numa_node))) + return -EINVAL; + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map = find_and_alloc_map(attr); if (IS_ERR(map)) -- cgit v1.2.3 From 89c63074c2bc25874e4e72406ff15a9a8e3df750 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 19 Aug 2017 03:12:45 +0200 Subject: bpf: make htab inlining more robust wrt assumptions Commit 9015d2f59535 ("bpf: inline htab_map_lookup_elem()") was making the assumption that a direct call emission to the function __htab_map_lookup_elem() will always work out for JITs. This is currently true since all JITs we have are for 64 bit archs, but in case of 32 bit JITs like upcoming arm32, we get a NULL pointer dereference when executing the call to __htab_map_lookup_elem() since passed arguments are of a different size (due to pointer args) than what we do out of BPF. Guard and thus limit this for now for the current 64 bit JITs only. Reported-by: Shubham Bansal Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f6e7eb42ba0..e42c096ba20d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4160,7 +4160,11 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } - if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup + * handlers are currently limited to 64 bit only. + */ + if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON || !map_ptr->ops->map_gen_lookup) -- cgit v1.2.3 From 7b0c2a0508b90fce79d3782b2e55d0e8bf6a283e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 19 Aug 2017 03:12:46 +0200 Subject: bpf: inline map in map lookup functions for array and htab Avoid two successive functions calls for the map in map lookup, first is the bpf_map_lookup_elem() helper call, and second the callback via map->ops->map_lookup_elem() to get to the map in map implementation. Implementation inlines array and htab flavor for map in map lookups. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 26 ++++++++++++++++++++++++++ kernel/bpf/hashtab.c | 17 +++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 96e9c5c1dfc9..98c0f00c3f5e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -606,6 +606,31 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } +static u32 array_of_map_gen_lookup(struct bpf_map *map, + struct bpf_insn *insn_buf) +{ + u32 elem_size = round_up(map->value_size, 8); + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + const int map_ptr = BPF_REG_1; + const int index = BPF_REG_2; + + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (is_power_of_2(elem_size)) + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); + else + *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); + *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + + return insn - insn_buf; +} + const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, @@ -615,4 +640,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, + .map_gen_lookup = array_of_map_gen_lookup, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 47ae748c3a49..ae822de4a90a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1322,6 +1322,22 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } +static u32 htab_of_map_gen_lookup(struct bpf_map *map, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct htab_elem, key) + + round_up(map->key_size, 8)); + *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); + + return insn - insn_buf; +} + static void htab_of_map_free(struct bpf_map *map) { bpf_map_meta_free(map->inner_map_meta); @@ -1337,4 +1353,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, + .map_gen_lookup = htab_of_map_gen_lookup, }; -- cgit v1.2.3 From 8fbbe2d7cc478d1544f41f2271787c993c23a4f6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 19 Aug 2017 12:57:51 +0300 Subject: genirq/ipi: Fixup checks against nr_cpu_ids Valid CPU ids are [0, nr_cpu_ids-1] inclusive. 
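A minimal sketch of the boundary condition being fixed (the helper name below is made up for illustration only):

    /* With nr_cpu_ids == 4 the valid ids are 0..3, so the bounds test
     * must reject cpu == nr_cpu_ids as well; the old "cpu > nr_cpu_ids"
     * check wrongly accepted it.
     */
    static bool ipi_cpu_id_valid(unsigned int cpu)
    {
            return cpu < nr_cpu_ids;
    }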
Fixes: 3b8e29a82dd1 ("genirq: Implement ipi_send_mask/single()") Fixes: f9bce791ae2a ("genirq: Add a new function to get IPI reverse mapping") Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170819095751.GB27864@avx2 --- kernel/irq/ipi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1c8ea0..259a22aa9934 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; - if (!data || !ipimask || cpu > nr_cpu_ids) + if (!data || !ipimask || cpu >= nr_cpu_ids) return INVALID_HWIRQ; if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, if (!chip->ipi_send_single && !chip->ipi_send_mask) return -EINVAL; - if (cpu > nr_cpu_ids) + if (cpu >= nr_cpu_ids) return -EINVAL; if (dest) { -- cgit v1.2.3 From 274043c6c95636e62f5b2514e78fdba82eb47601 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Aug 2017 01:48:12 +0200 Subject: bpf: fix double free from dev_map_notification() In the current code, dev_map_free() can still race with dev_map_notification(). In dev_map_free(), we remove dtab from the list of dtabs after we purged all entries from it. However, we don't do xchg() with NULL or the like, so the entry at that point is still pointing to the device. If a unregister notification comes in at the same time, we therefore risk a double-free, since the pointer is still present in the map, and then pushed again to __dev_map_entry_free(). All this is completely unnecessary. Just remove the dtab from the list right before the synchronize_rcu(), so all outstanding readers from the notifier list have finished by then, thus we don't need to deal with this corner case anymore and also wouldn't need to nullify dev entires. This is fine because we iterate over the map releasing all entries and therefore dev references anyway. Fixes: 4cc7b9544b9a ("bpf: devmap fix mutex in rcu critical section") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 67f4f00ce33a..fa08181d1c3d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -148,6 +148,11 @@ static void dev_map_free(struct bpf_map *map) * no further reads against netdev_map. It does __not__ ensure pending * flush operations (if any) are complete. */ + + spin_lock(&dev_map_lock); + list_del_rcu(&dtab->list); + spin_unlock(&dev_map_lock); + synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush @@ -162,10 +167,6 @@ static void dev_map_free(struct bpf_map *map) cpu_relax(); } - /* Although we should no longer have datapath or bpf syscall operations - * at this point we we can still race with netdev notifier, hence the - * lock. 
- */ for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -180,9 +181,6 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ - spin_lock(&dev_map_lock); - list_del_rcu(&dtab->list); - spin_unlock(&dev_map_lock); free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); -- cgit v1.2.3 From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: pids: make task_tgid_nr_ns() safe This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/pid.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3 From cd36c3a21a400cac9c457394b9adf94e0027c136 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Aug 2017 00:06:09 +0200 Subject: bpf: fix map value attribute for hash of maps Currently, iproute2's BPF ELF loader works fine with array of maps when retrieving the fd from a pinned node and doing a selfcheck against the provided map attributes from the object file, but we fail to do the same for hash of maps and thus refuse to get the map from pinned node. Reason is that when allocating hash of maps, fd_htab_map_alloc() will set the value size to sizeof(void *), and any user space map creation requests are forced to set 4 bytes as value size. Thus, selfcheck will complain about exposed 8 bytes on 64 bit archs vs. 4 bytes from object file as value size. Contract is that fdinfo or BPF_MAP_GET_FD_BY_ID returns the value size used to create the map. Fix it by handling it the same way as we do for array of maps, which means that we leave value size at 4 bytes and in the allocation phase round up value size to 8 bytes. alloc_htab_elem() needs an adjustment in order to copy rounded up 8 bytes due to bpf_fd_htab_map_update_elem() calling into htab_map_update_elem() with the pointer of the map pointer as value. 
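A rough sketch of the sizing rule (the flag name below is invented for illustration; the real helper is in the hunk further down):

    /* The user-visible value_size stays 4 bytes, while the per-element
     * allocation is rounded up so the 8-byte inner map pointer fits on
     * 64-bit.
     */
    u32 stored_size = map->value_size;              /* 4, as created */
    if (percpu || fd_htab_on_64bit)                 /* invented name */
            stored_size = round_up(stored_size, 8); /* becomes 8 */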
Unlike array of maps where we just xchg(), we're using the generic htab_map_update_elem() callback also used from helper calls, which published the key/value already on return, so we need to ensure to memcpy() the right size. Fixes: bcc6b1b7ebf8 ("bpf: Add hash of maps support") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ae822de4a90a..d246905f2bb1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -662,12 +662,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && + BITS_PER_LONG == 64; +} + +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ + u32 size = htab->map.value_size; + + if (percpu || fd_htab_map_needs_adjust(htab)) + size = round_up(size, 8); + return size; +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab->map.value_size; + u32 size = htab_size_value(htab, percpu); bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -707,9 +722,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - /* round up value_size to 8 bytes */ - size = round_up(size, 8); - if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -1220,17 +1232,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) { - struct bpf_map *map; - if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); - - /* pointer is stored internally */ - attr->value_size = sizeof(void *); - map = htab_map_alloc(attr); - attr->value_size = sizeof(u32); - - return map; + return htab_map_alloc(attr); } static void fd_htab_map_free(struct bpf_map *map) -- cgit v1.2.3 From 33ba43ed0afc13a29b1314e3e45a9938d310ba13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Aug 2017 00:06:09 +0200 Subject: bpf: fix map value attribute for hash of maps Currently, iproute2's BPF ELF loader works fine with array of maps when retrieving the fd from a pinned node and doing a selfcheck against the provided map attributes from the object file, but we fail to do the same for hash of maps and thus refuse to get the map from pinned node. Reason is that when allocating hash of maps, fd_htab_map_alloc() will set the value size to sizeof(void *), and any user space map creation requests are forced to set 4 bytes as value size. Thus, selfcheck will complain about exposed 8 bytes on 64 bit archs vs. 4 bytes from object file as value size. Contract is that fdinfo or BPF_MAP_GET_FD_BY_ID returns the value size used to create the map. Fix it by handling it the same way as we do for array of maps, which means that we leave value size at 4 bytes and in the allocation phase round up value size to 8 bytes. alloc_htab_elem() needs an adjustment in order to copy rounded up 8 bytes due to bpf_fd_htab_map_update_elem() calling into htab_map_update_elem() with the pointer of the map pointer as value. 
Unlike array of maps where we just xchg(), we're using the generic htab_map_update_elem() callback also used from helper calls, which published the key/value already on return, so we need to ensure to memcpy() the right size. Fixes: bcc6b1b7ebf8 ("bpf: Add hash of maps support") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..d11c8181f4c5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && + BITS_PER_LONG == 64; +} + +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ + u32 size = htab->map.value_size; + + if (percpu || fd_htab_map_needs_adjust(htab)) + size = round_up(size, 8); + return size; +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab->map.value_size; + u32 size = htab_size_value(htab, percpu); bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - /* round up value_size to 8 bytes */ - size = round_up(size, 8); - if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) { - struct bpf_map *map; - if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); - - /* pointer is stored internally */ - attr->value_size = sizeof(void *); - map = htab_map_alloc(attr); - attr->value_size = sizeof(u32); - - return map; + return htab_map_alloc(attr); } static void fd_htab_map_free(struct bpf_map *map) -- cgit v1.2.3 From 9e18d0c82f0c07f2a41898d4adbb698a953403ee Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 28 Jul 2017 13:56:07 +0200 Subject: ANDROID: binder: add hwbinder,vndbinder to BINDER_DEVICES. These will be required going forward. Signed-off-by: Martijn Coenen Cc: stable # 4.11+ Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index d70829033bb7..d3fd428f4b92 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -10,6 +10,7 @@ # CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y -- cgit v1.2.3 From af4d045ceeca04946d89453206269aea6c338a8e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Aug 2017 01:47:54 +0200 Subject: bpf: minor cleanups for dev_map Some minor code cleanups, while going over it I also noticed that we're accounting the bitmap only for one CPU currently, so fix that up as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/devmap.c | 100 +++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index fa08181d1c3d..bfecabfd4974 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,30 +48,30 @@ * calls will fail at this point. */ #include -#include #include -#include -#include "percpu_freelist.h" -#include "bpf_lru_list.h" -#include "map_in_map.h" struct bpf_dtab_netdev { struct net_device *dev; - int key; - struct rcu_head rcu; struct bpf_dtab *dtab; + unsigned int bit; + struct rcu_head rcu; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long int __percpu *flush_needed; + unsigned long __percpu *flush_needed; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static u64 dev_map_bitmap_size(const union bpf_attr *attr) +{ + return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; @@ -95,11 +95,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) dtab->map.map_flags = attr->map_flags; dtab->map.numa_node = bpf_map_attr_numa_node(attr); - err = -ENOMEM; - /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); + cost += dev_map_bitmap_size(attr) * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; @@ -110,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; - err = -ENOMEM; /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu( - BITS_TO_LONGS(attr->max_entries) * - sizeof(unsigned long), - __alignof__(unsigned long)); + dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), + __alignof__(unsigned long)); if (!dtab->flush_needed) goto free_dtab; @@ -128,12 +123,12 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); - return &dtab->map; + return &dtab->map; free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); - return ERR_PTR(err); + return ERR_PTR(-ENOMEM); } static void dev_map_free(struct bpf_map *map) @@ -178,9 +173,6 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - /* At this point bpf program is detached and all pending operations - * _must_ be complete - */ free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); @@ -190,7 +182,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 index = key ? 
*(u32 *)key : U32_MAX; - u32 *next = (u32 *)next_key; + u32 *next = next_key; if (index >= dtab->map.max_entries) { *next = 0; @@ -199,29 +191,16 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) if (index == dtab->map.max_entries - 1) return -ENOENT; - *next = index + 1; return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 key) +void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - __set_bit(key, bitmap); -} - -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev; - - if (key >= map->max_entries) - return NULL; - - dev = READ_ONCE(dtab->netdev_map[key]); - return dev ? dev->dev : NULL; + __set_bit(bit, bitmap); } /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled @@ -248,7 +227,6 @@ void __dev_map_flush(struct bpf_map *map) continue; netdev = dev->dev; - __clear_bit(bit, bitmap); if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush)) continue; @@ -261,43 +239,49 @@ void __dev_map_flush(struct bpf_map *map) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. */ -static void *dev_map_lookup_elem(struct bpf_map *map, void *key) +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev; - u32 i = *(u32 *)key; - if (i >= map->max_entries) + if (key >= map->max_entries) return NULL; - dev = READ_ONCE(dtab->netdev_map[i]); - return dev ? &dev->dev->ifindex : NULL; + dev = READ_ONCE(dtab->netdev_map[key]); + return dev ? dev->dev : NULL; } -static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev) +static void *dev_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); + + return dev ? &dev->ifindex : NULL; +} + +static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { - if (old_dev->dev->netdev_ops->ndo_xdp_flush) { - struct net_device *fl = old_dev->dev; + if (dev->dev->netdev_ops->ndo_xdp_flush) { + struct net_device *fl = dev->dev; unsigned long *bitmap; int cpu; for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu); - __clear_bit(old_dev->key, bitmap); + bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); + __clear_bit(dev->bit, bitmap); - fl->netdev_ops->ndo_xdp_flush(old_dev->dev); + fl->netdev_ops->ndo_xdp_flush(dev->dev); } } } static void __dev_map_entry_free(struct rcu_head *rcu) { - struct bpf_dtab_netdev *old_dev; + struct bpf_dtab_netdev *dev; - old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - dev_map_flush_old(old_dev); - dev_put(old_dev->dev); - kfree(old_dev); + dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_map_flush_old(dev); + dev_put(dev->dev); + kfree(dev); } static int dev_map_delete_elem(struct bpf_map *map, void *key) @@ -309,8 +293,8 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - /* Use synchronize_rcu() here to ensure any rcu critical sections - * have completed, but this does not guarantee a flush has happened + /* Use call_rcu() here to ensure any rcu critical sections have + * completed, but this does not guarantee a flush has happened * yet. 
Because driver side rcu_read_lock/unlock only protects the * running XDP program. However, for pending flush operations the * dev and ctx are stored in another per cpu map. And additionally, @@ -334,10 +318,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; - if (unlikely(i >= dtab->map.max_entries)) return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; @@ -355,7 +337,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; } - dev->key = i; + dev->bit = i; dev->dtab = dtab; } -- cgit v1.2.3 From 0abce64a55ae44d39b92f8e672736f4f324e610f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Jun 2017 21:42:57 +0100 Subject: genirq: Let irq_set_vcpu_affinity() iterate over hierarchy When assigning an interrupt to a vcpu, it is not unlikely that the level of the hierarchy implementing irq_set_vcpu_affinity is not the top level (think a generic MSI domain on top of a virtualization aware interrupt controller). In such a case, let's iterate over the hierarchy until we find an irqchip implementing it. Reviewed-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- kernel/irq/manage.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1d1a5b945ab4..573dc52b0806 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -400,8 +400,18 @@ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) return -EINVAL; data = irq_desc_get_irq_data(desc); - chip = irq_data_get_irq_chip(data); - if (chip && chip->irq_set_vcpu_affinity) + do { + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) ret = chip->irq_set_vcpu_affinity(data, vcpu_info); irq_put_desc_unlock(desc, flags); -- cgit v1.2.3 From c5a94a618e7ac86b20f53d947f68d7cee6a4c6bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Aug 2017 13:58:44 +0200 Subject: workqueue: Use TASK_IDLE Workqueues don't use signals, it (ab)uses TASK_INTERRUPTIBLE to avoid increasing the loadavg numbers. We've 'recently' introduced TASK_IDLE for this case: 80ed87c8a9ca ("sched/wait: Introduce TASK_NOLOAD and TASK_IDLE") use it. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4fa6c7650f09..2d278b9a5469 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2247,7 +2247,7 @@ sleep: * event. */ worker_enter_idle(worker); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_IDLE); spin_unlock_irq(&pool->lock); schedule(); goto woke_up; @@ -2289,7 +2289,7 @@ static int rescuer_thread(void *__rescuer) */ rescuer->task->flags |= PF_WQ_WORKER; repeat: - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_IDLE); /* * By the time the rescuer is requested to stop, the workqueue -- cgit v1.2.3 From 74d46992e0d9dee7f1f376de0d56d31614c8a17a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:32 +0200 Subject: block: replace bi_bdev with a gendisk pointer and partitions index This way we don't need a block_device structure to submit I/O. 
The block_device has different life time rules from the gendisk and request_queue and is usually only available when the block device node is open. Other callers need to explicitly create one (e.g. the lightnvm passthrough code, or the new nvme multipathing code). For the actual I/O path all that we need is the gendisk, which exists once per block device. But given that the block layer also does partition remapping we additionally need a partition index, which is used for said remapping in generic_make_request. Note that all the block drivers generally want request_queue or sometimes the gendisk, so this removes a layer of indirection all over the stack. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/power/swap.c | 5 ++--- kernel/trace/blktrace.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 57d22571f306..d7cdc426ee38 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio) if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } @@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = hib_resume_bdev; + bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7724de18d2fe..2a685b45b73b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -963,7 +963,7 @@ static void blk_add_trace_bio_remap(void *ignore, return; r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, -- cgit v1.2.3 From 63f45f840634ab5fd71bbc07acff915277764068 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 23 Aug 2017 15:10:03 +0100 Subject: bpf/verifier: when pruning a branch, ignore its write marks The fact that writes occurred in reaching the continuation state does not screen off its reads from us, because we're not really its parent. So detect 'not really the parent' in do_propagate_liveness, and ignore write marks in that case. Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning") Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e42c096ba20d..fdbaa6086559 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3436,6 +3436,7 @@ out_free: static bool do_propagate_liveness(const struct bpf_verifier_state *state, struct bpf_verifier_state *parent) { + bool writes = parent == state->parent; /* Observe write marks */ bool touched = false; /* any changes made? 
*/ int i; @@ -3447,7 +3448,9 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, for (i = 0; i < BPF_REG_FP; i++) { if (parent->regs[i].live & REG_LIVE_READ) continue; - if (state->regs[i].live == REG_LIVE_READ) { + if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) + continue; + if (state->regs[i].live & REG_LIVE_READ) { parent->regs[i].live |= REG_LIVE_READ; touched = true; } @@ -3460,7 +3463,9 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, continue; if (parent->spilled_regs[i].live & REG_LIVE_READ) continue; - if (state->spilled_regs[i].live == REG_LIVE_READ) { + if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + continue; + if (state->spilled_regs[i].live & REG_LIVE_READ) { parent->spilled_regs[i].live |= REG_LIVE_READ; touched = true; } -- cgit v1.2.3 From 1b688a19a92223cf2d1892c9d05d64dc397b33e3 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 23 Aug 2017 15:10:50 +0100 Subject: bpf/verifier: remove varlen_map_value_access flag The optimisation it does is broken when the 'new' register value has a variable offset and the 'old' was constant. I broke it with my pointer types unification (see Fixes tag below), before which the 'new' value would have type PTR_TO_MAP_VALUE_ADJ and would thus not compare equal; other changes in that patch mean that its original behaviour (ignore min/max values) cannot be restored. Tests on a sample set of cilium programs show no change in count of processed instructions. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fdbaa6086559..711bdbd22cea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -832,11 +832,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (log_level) print_verifier_state(state); - /* If the offset is variable, we will need to be stricter in state - * pruning from now on. - */ - if (!tnum_is_const(reg->var_off)) - env->varlen_map_value_access = true; /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -3247,9 +3242,8 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) } /* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, - bool varlen_map_access, struct idpair *idmap) +static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, + struct idpair *idmap) { if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ @@ -3281,22 +3275,14 @@ static bool regsafe(struct bpf_reg_state *rold, tnum_is_unknown(rold->var_off); } case PTR_TO_MAP_VALUE: - if (varlen_map_access) { - /* If the new min/max/var_off satisfy the old ones and - * everything else matches, we are OK. 
- * We don't care about the 'id' value, because nothing - * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) - */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && - range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); - } else { - /* If the ranges/var_off were not the same, but - * everything else was and we didn't do a variable - * access into a map then we are a-ok. - */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0; - } + /* If the new min/max/var_off satisfy the old ones and + * everything else matches, we are OK. + * We don't care about the 'id' value, because nothing + * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_VALUE_OR_NULL: /* a PTR_TO_MAP_VALUE could be safe to use as a * PTR_TO_MAP_VALUE_OR_NULL into the same map. @@ -3380,7 +3366,6 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { - bool varlen_map_access = env->varlen_map_value_access; struct idpair *idmap; bool ret = false; int i; @@ -3391,8 +3376,7 @@ static bool states_equal(struct bpf_verifier_env *env, return false; for (i = 0; i < MAX_BPF_REG; i++) { - if (!regsafe(&old->regs[i], &cur->regs[i], varlen_map_access, - idmap)) + if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) goto out_free; } @@ -3412,7 +3396,7 @@ static bool states_equal(struct bpf_verifier_env *env, continue; if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], &cur->spilled_regs[i / BPF_REG_SIZE], - varlen_map_access, idmap)) + idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -3555,7 +3539,6 @@ static int do_check(struct bpf_verifier_env *env) init_reg_state(regs); state->parent = NULL; insn_idx = 0; - env->varlen_map_value_access = false; for (;;) { struct bpf_insn *insn; u8 class; -- cgit v1.2.3 From 8e9cd9ce90d48369b2c5ddd79fe3d4a4cb1ccb56 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 23 Aug 2017 15:11:21 +0100 Subject: bpf/verifier: document liveness analysis The liveness tracking algorithm is quite subtle; add comments to explain it. Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 711bdbd22cea..d690c7dd1f1a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3417,6 +3417,12 @@ out_free: return ret; } +/* A write screens off any subsequent reads; but write marks come from the + * straight-line code between a state and its parent. When we arrive at a + * jump target (in the first iteration of the propagate_liveness() loop), + * we didn't arrive by the straight-line code, so read marks in state must + * propagate to parent regardless of state's write marks. + */ static bool do_propagate_liveness(const struct bpf_verifier_state *state, struct bpf_verifier_state *parent) { @@ -3457,6 +3463,15 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, return touched; } +/* "parent" is "a state from which we reach the current state", but initially + * it is not the state->parent (i.e. 
"the state whose straight-line code leads + * to the current state"), instead it is the state that happened to arrive at + * a (prunable) equivalent of the current state. See comment above + * do_propagate_liveness() for consequences of this. + * This function is just a more efficient way of calling mark_reg_read() or + * mark_stack_slot_read() on each reg in "parent" that is read in "state", + * though it requires that parent != state->parent in the call arguments. + */ static void propagate_liveness(const struct bpf_verifier_state *state, struct bpf_verifier_state *parent) { @@ -3485,6 +3500,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. + * If we have any write marks in env->cur_state, they + * will prevent corresponding reads in the continuation + * from reaching our parent (an explored_state). Our + * own state will get the read marks recorded, but + * they'll be immediately forgotten as we're pruning + * this state and will pop a new one. */ propagate_liveness(&sl->state, &env->cur_state); return 1; @@ -3508,7 +3529,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ env->cur_state.parent = &new_sl->state; - /* clear liveness marks in current state */ + /* clear write marks in current state: the writes we did are not writes + * our child did, so they don't screen off its reads from us. + * (There are no read marks in current state, because reads always mark + * their parent and current state never has children yet. Only + * explored_states can get read marks.) + */ for (i = 0; i < BPF_REG_FP; i++) env->cur_state.regs[i].live = REG_LIVE_NONE; for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) -- cgit v1.2.3 From a5e2da6e9787187ff104c34aa048419703c1f9cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 24 Aug 2017 03:20:11 +0200 Subject: bpf: netdev is never null in __dev_map_flush No need to test for it in fast-path, every dev in bpf_dtab_netdev is guaranteed to be non-NULL, otherwise dev_map_update_elem() will fail in the first place. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index bfecabfd4974..ecf9f99ecc57 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -226,12 +226,10 @@ void __dev_map_flush(struct bpf_map *map) if (unlikely(!dev)) continue; - netdev = dev->dev; __clear_bit(bit, bitmap); - if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush)) - continue; - - netdev->netdev_ops->ndo_xdp_flush(netdev); + netdev = dev->dev; + if (likely(netdev->netdev_ops->ndo_xdp_flush)) + netdev->netdev_ops->ndo_xdp_flush(netdev); } } -- cgit v1.2.3 From 2fe59f507a65dbd734b990a11ebc7488f6f87a24 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Aug 2017 18:43:48 +1000 Subject: timers: Fix excessive granularity of new timers after a nohz idle When a timer base is idle, it is forwarded when a new timer is added to ensure that granularity does not become excessive. When not idle, the timer tick is expected to increment the base. However there are several problems: - If an existing timer is modified, the base is forwarded only after the index is calculated. 
- The base is not forwarded by add_timer_on. - There is a window after a timer is restarted from a nohz idle, after it is marked not-idle and before the timer tick on this CPU, where a timer may be added but the ancient base does not get forwarded. These result in excessive granularity (a 1 jiffy timeout can blow out to 100s of jiffies), which cause the rcu lockup detector to trigger, among other things. Fix this by keeping track of whether the timer base has been idle since it was last run or forwarded, and if so then forward it before adding a new timer. There is still a case where mod_timer optimises the case of a pending timer mod with the same expiry time, where the timer can see excessive granularity relative to the new, shorter interval. A comment is added, but it's not changed because it is an important fastpath for networking. This has been tested and found to fix the RCU softlockup messages. Testing was also done with tracing to measure requested versus achieved wakeup latencies for all non-deferrable timers in an idle system (with no lockup watchdogs running). Wakeup latency relative to absolute latency is calculated (note this suffers from round-up skew at low absolute times) and analysed: max avg std upstream 506.0 1.20 4.68 patched 2.0 1.08 0.15 The bug was noticed due to the lockup detector Kconfig changes dropping it out of people's .configs and resulting in larger base clk skew When the lockup detectors are enabled, no CPU can go idle for longer than 4 seconds, which limits the granularity errors. Sub-optimal timer behaviour is observable on a smaller scale in that case: max avg std upstream 9.0 1.05 0.19 patched 2.0 1.04 0.11 Fixes: Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Signed-off-by: Nicholas Piggin Signed-off-by: Thomas Gleixner Tested-by: Jonathan Cameron Tested-by: David Miller Cc: dzickus@redhat.com Cc: sfr@canb.auug.org.au Cc: mpe@ellerman.id.au Cc: Stephen Boyd Cc: linuxarm@huawei.com Cc: abdhalee@linux.vnet.ibm.com Cc: John Stultz Cc: akpm@linux-foundation.org Cc: paulmck@linux.vnet.ibm.com Cc: torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170822084348.21436-1-npiggin@gmail.com --- kernel/time/timer.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8f5d1bf18854..f2674a056c26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -203,6 +203,7 @@ struct timer_base { bool migration_enabled; bool nohz_active; bool is_idle; + bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow = READ_ONCE(jiffies); + unsigned long jnow; /* - * We only forward the base when it's idle and we have a delta between - * base clock and jiffies. + * We only forward the base when we are idle or have just come out of + * idle (must_forward_clk logic), and have a delta between base clock + * and jiffies. In the common case, run_timers will take care of it. 
*/ - if (!base->is_idle || (long) (jnow - base->clk) < 2) + if (likely(!base->must_forward_clk)) + return; + + jnow = READ_ONCE(jiffies); + base->must_forward_clk = base->is_idle; + if ((long)(jnow - base->clk) < 2) return; /* @@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * same array bucket then just return: */ if (timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. + */ if (timer->expires == expires) return 1; @@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); + forward_timer_base(base); clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } else { base = lock_timer_base(timer, &flags); + forward_timer_base(base); } ret = detach_if_pending(timer, base, false); @@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); } } - /* Try to forward a stale timer base clock */ - forward_timer_base(base); - timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu) WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } + forward_timer_base(base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); @@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (!is_max_delta) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle: + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC) { + base->must_forward_clk = true; base->is_idle = true; + } } raw_spin_unlock(&base->lock); @@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + /* + * must_forward_clk must be cleared before running timers so that any + * timer functions that call mod_timer will not try to forward the + * base. idle trcking / clock forwarding logic is only used with + * BASE_STD timers. + * + * The deferrable base does not do idle tracking at all, so we do + * not forward it. This can result in very large variations in + * granularity for deferrable timers, but they can be deferred for + * long periods due to idle. 
+ */ + base->must_forward_clk = false; + __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); -- cgit v1.2.3 From a8f0f9e49956a74718874b800251455680085600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Aug 2017 16:37:25 -0400 Subject: ftrace: Check for null ret_stack on profile function graph entry function There's a small race when function graph shutsdown and the calling of the registered function graph entry callback. The callback must not reference the task's ret_stack without first checking that it is not NULL. Note, when a ret_stack is allocated for a task, it stays allocated until the task exits. The problem here, is that function_graph is shutdown, and a new task was created, which doesn't have its ret_stack allocated. But since some of the functions are still being traced, the callbacks can still be called. The normal function_graph code handles this, but starting with commit 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") the profiler code references the ret_stack on function entry, but doesn't check if it is NULL first. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196611 Cc: stable@vger.kernel.org Fixes: 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") Reported-by: lilydjwg@gmail.com Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02004ae91860..96cea88fa00f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) function_profile_call(trace->func, 0, NULL, NULL); + /* If function graph is shutting down, ret_stack can be NULL */ + if (!current->ret_stack) + return 0; + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) current->ret_stack[index].subtime = 0; -- cgit v1.2.3 From 475bb3c69ab05df2a6ecef6acc2393703d134180 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Mon, 14 Aug 2017 18:18:17 +0800 Subject: tracing: Fix kmemleak in tracing_map_array_free() kmemleak reported the below leak when I was doing clear of the hist trigger. With this patch, the kmeamleak is gone. unreferenced object 0xffff94322b63d760 (size 32): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 01 00 00 04 00 00 00 08 00 00 00 ff 00 00 00 ................ 10 00 00 00 00 00 00 00 80 a8 7a f2 31 94 ff ff ..........z.1... backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0xca/0x1d0 [] tracing_map_array_alloc+0x26/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff unreferenced object 0xffff9431f27aa880 (size 128): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 00 8c 2a 32 94 ff ff 00 f0 8b 2a 32 94 ff ff ...*2......*2... 00 e0 8b 2a 32 94 ff ff 00 d0 8b 2a 32 94 ff ff ...*2......*2... 
backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc+0xe8/0x220 [] tracing_map_array_alloc+0xb1/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1502705898-27571-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 08d43a5fa063 ("tracing: Add lock-free tracing_map") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 0a689bbb78ef..305039b122fa 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a) if (!a) return; - if (!a->pages) { - kfree(a); - return; - } + if (!a->pages) + goto free; for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; free_page((unsigned long)a->pages[i]); } + + kfree(a->pages); + + free: + kfree(a); } struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, -- cgit v1.2.3 From 8b0db1a5bdfcee0dbfa89607672598ae203c9045 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 23 Aug 2017 12:46:27 -0400 Subject: tracing: Fix freeing of filter in create_filter() when set_str is false Performing the following task with kmemleak enabled: # cd /sys/kernel/tracing/events/irq/irq_handler_entry/ # echo 'enable_event:kmem:kmalloc:3 if irq >' > trigger # echo 'enable_event:kmem:kmalloc:3 if irq > 31' > trigger # echo scan > /sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800b9290308 (size 32): comm "bash", pid 1114, jiffies 4294848451 (age 141.139s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x158/0x290 [] create_filter_start.constprop.28+0x99/0x940 [] create_filter+0xa9/0x160 [] create_event_filter+0xc/0x10 [] set_trigger_filter+0xe5/0x210 [] event_enable_trigger_func+0x324/0x490 [] event_trigger_write+0x1a2/0x260 [] __vfs_write+0xd7/0x380 [] vfs_write+0x101/0x260 [] SyS_write+0xab/0x130 [] entry_SYSCALL_64_fastpath+0x1f/0xbe [] 0xffffffffffffffff The function create_filter() is passed a 'filterp' pointer that gets allocated, and if "set_str" is true, it is up to the caller to free it, even on error. The problem is that the pointer is not freed by create_filter() when set_str is false. This is a bug, and it is not up to the caller to free the filter on error if it doesn't care about the string. 
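A caller-side sketch of that contract (the argument order is schematic, not copied from the tracing code):

    /* With set_str == false the caller never receives a partially
     * built filter on error, so it has nothing to free; the change
     * below makes create_filter() release it and hand back NULL.
     */
    err = create_filter(call, filter_str, false, &filter);
    if (err)
            return err;     /* filter is NULL here, nothing leaks */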
Link: http://lkml.kernel.org/r/1502705898-27571-2-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 38b78eb85 ("tracing: Factorize filter creation") Reported-by: Chunyu Hu Tested-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 59a411ff60c7..181e139a8057 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; -- cgit v1.2.3 From 1c08c22c874ac88799cab1f78c40f46110274915 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Aug 2017 12:04:29 -0400 Subject: cpuset: Fix incorrect memory_pressure control file mapping The memory_pressure control file was incorrectly set up without a private value (0, by default). As a result, this control file was treated like memory_migrate on read. By adding back the FILE_MEMORY_PRESSURE private value, the correct memory pressure value will be returned. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Fixes: 7dbdb199d3bf ("cgroup: replace cftype->mode with CFTYPE_WORLD_WRITABLE") Cc: stable@vger.kernel.org # v4.4+ --- kernel/cgroup/cpuset.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..8362bac0d179 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1891,6 +1891,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { -- cgit v1.2.3 From 3fd87127073292538047adf1c9c757e9cab0dd56 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 24 Aug 2017 14:38:51 -0700 Subject: strparser: initialize all callbacks commit bbb03029a899 ("strparser: Generalize strparser") added more function pointers to 'struct strp_callbacks'; however, kcm_attach() was not updated to initialize them. This could cause the ->lock() and/or ->unlock() function pointers to be set to garbage values, causing a crash in strp_work(). Fix the bug by moving the callback structs into static memory, so unspecified members are zeroed. Also constify them while we're at it. 
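The static const initializer is sufficient because of plain C initialization semantics; a sketch mirroring the hunk further down:

    /* Members not named in a designated initializer, such as ->lock
     * and ->unlock here, are zero-initialized, so strparser sees NULL
     * callbacks instead of uninitialized stack contents.
     */
    static const struct strp_callbacks cb = {
            .rcv_msg        = smap_read_sock_strparser,
            .parse_msg      = smap_parse_func_strparser,
            .read_sock_done = smap_read_sock_done,
            /* ->lock, ->unlock intentionally left NULL */
    };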
This bug was found by syzkaller, which encountered the following splat: IP: 0x55 PGD 3b1ca067 P4D 3b1ca067 PUD 3b12f067 PMD 0 Oops: 0010 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 2 PID: 1194 Comm: kworker/u8:1 Not tainted 4.13.0-rc4-next-20170811 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: kstrp strp_work task: ffff88006bb0e480 task.stack: ffff88006bb10000 RIP: 0010:0x55 RSP: 0018:ffff88006bb17540 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88006ce4bd60 RCX: 0000000000000000 RDX: 1ffff1000d9c97bd RSI: 0000000000000000 RDI: ffff88006ce4bc48 RBP: ffff88006bb17558 R08: ffffffff81467ab2 R09: 0000000000000000 R10: ffff88006bb17438 R11: ffff88006bb17940 R12: ffff88006ce4bc48 R13: ffff88003c683018 R14: ffff88006bb17980 R15: ffff88003c683000 FS: 0000000000000000(0000) GS:ffff88006de00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000055 CR3: 000000003c145000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2098 worker_thread+0x223/0x1860 kernel/workqueue.c:2233 kthread+0x35e/0x430 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 Code: Bad RIP value. RIP: 0x55 RSP: ffff88006bb17540 CR2: 0000000000000055 ---[ end trace f0e4920047069cee ]--- Here is a C reproducer (requires CONFIG_BPF_SYSCALL=y and CONFIG_AF_KCM=y): #include #include #include #include #include #include #include #include static const struct bpf_insn bpf_insns[3] = { { .code = 0xb7 }, /* BPF_MOV64_IMM(0, 0) */ { .code = 0x95 }, /* BPF_EXIT_INSN() */ }; static const union bpf_attr bpf_attr = { .prog_type = 1, .insn_cnt = 2, .insns = (uintptr_t)&bpf_insns, .license = (uintptr_t)"", }; int main(void) { int bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &bpf_attr, sizeof(bpf_attr)); int inet_fd = socket(AF_INET, SOCK_STREAM, 0); int kcm_fd = socket(AF_KCM, SOCK_DGRAM, 0); ioctl(kcm_fd, SIOCKCMATTACH, &(struct kcm_attach) { .fd = inet_fd, .bpf_fd = bpf_fd }); } Fixes: bbb03029a899 ("strparser: Generalize strparser") Cc: Dmitry Vyukov Cc: Tom Herbert Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 78b2bb9370ac..617c239590c2 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -368,12 +368,12 @@ static int smap_read_sock_done(struct strparser *strp, int err) static int smap_init_sock(struct smap_psock *psock, struct sock *sk) { - struct strp_callbacks cb; + static const struct strp_callbacks cb = { + .rcv_msg = smap_read_sock_strparser, + .parse_msg = smap_parse_func_strparser, + .read_sock_done = smap_read_sock_done, + }; - memset(&cb, 0, sizeof(cb)); - cb.rcv_msg = smap_read_sock_strparser; - cb.parse_msg = smap_parse_func_strparser; - cb.read_sock_done = smap_read_sock_done; return strp_init(&psock->strp, sk, &cb); } -- cgit v1.2.3 From 64aee2a965cf2954a038b5522f11d2cd2f0f8f3e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 22 Jun 2017 15:41:38 +0100 Subject: perf/core: Fix group {cpu,task} validation Regardless of which events form a group, it does not make sense for the events to target different tasks and/or CPUs, as this leaves the group inconsistent and impossible to schedule. 
The core perf code assumes that these are consistent across (successfully initialised) groups. Core perf code only verifies this when moving SW events into a HW context. Thus, we can violate this requirement for pure SW groups and pure HW groups, unless the relevant PMU driver happens to perform this verification itself. These mismatched groups subsequently wreak havoc elsewhere. For example, we handle watchpoints as SW events, and reserve watchpoint HW on a per-CPU basis at pmu::event_init() time to ensure that any event that is initialised is guaranteed to have a slot at pmu::add() time. However, the core code only checks the group leader's cpu filter (via event_filter_match()), and can thus install follower events onto CPUs violating their (mismatched) CPU filters, potentially installing them into a CPU without sufficient reserved slots. This can be triggered with the below test case, resulting in warnings from arch backends. #define _GNU_SOURCE #include #include #include #include #include #include #include static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } char watched_char; struct perf_event_attr wp_attr = { .type = PERF_TYPE_BREAKPOINT, .bp_type = HW_BREAKPOINT_RW, .bp_addr = (unsigned long)&watched_char, .bp_len = 1, .size = sizeof(wp_attr), }; int main(int argc, char *argv[]) { int leader, ret; cpu_set_t cpus; /* * Force use of CPU0 to ensure our CPU0-bound events get scheduled. */ CPU_ZERO(&cpus); CPU_SET(0, &cpus); ret = sched_setaffinity(0, sizeof(cpus), &cpus); if (ret) { printf("Unable to set cpu affinity\n"); return 1; } /* open leader event, bound to this task, CPU0 only */ leader = perf_event_open(&wp_attr, 0, 0, -1, 0); if (leader < 0) { printf("Couldn't open leader: %d\n", leader); return 1; } /* * Open a follower event that is bound to the same task, but a * different CPU. This means that the group should never be possible to * schedule. */ ret = perf_event_open(&wp_attr, 0, 1, leader, 0); if (ret < 0) { printf("Couldn't open mismatched follower: %d\n", ret); return 1; } else { printf("Opened leader/follower with mismatched CPUs\n"); } /* * Open as many independent events as we can, all bound to the same * task, CPU0 only. */ do { ret = perf_event_open(&wp_attr, 0, 0, -1, 0); } while (ret >= 0); /* * Force enable/disable all events to trigger the erroneous * installation of the follower event. */ printf("Opened all events. Toggling..\n"); for (;;) { prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0); prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0); } return 0; } Fix this by validating this requirement regardless of whether we're moving events.
Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zhou Chengming Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1498142498-15758-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ee20d4c546b5..3504125871d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10032,28 +10032,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. + */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned -- cgit v1.2.3 From 2ab346cfb0decf01523949e29f5cf542f2304611 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Aug 2017 17:18:16 +0100 Subject: perf/aux: Make aux_{head,wakeup} ring_buffer members long The aux_head and aux_wakeup members of struct ring_buffer are defined using the local_t type, despite the fact that they are only accessed via the perf_aux_output_*() functions, which cannot race with each other for a given ring buffer. This patch changes the type of the members to long, so we can avoid using the local_*() API where it isn't needed. 
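As a rough userspace model of why plain longs suffice here (illustrative names only, not the kernel code; an int stands in for the aux_nest guard): only one writer can be between perf_aux_output_begin() and _end() for a given ring buffer, so the head/wakeup updates are ordinary arithmetic rather than atomic read-modify-write operations.

#include <stdio.h>

/* Toy model: aux_head/aux_wakeup as plain longs, single writer enforced. */
struct aux_rb {
        long head;      /* was local_t aux_head */
        long wakeup;    /* was local_t aux_wakeup */
        long watermark;
        int  nest;      /* stands in for local_xchg(&rb->aux_nest, 1) */
};

static int aux_begin(struct aux_rb *rb)
{
        if (rb->nest)           /* another writer already active: bail out */
                return -1;
        rb->nest = 1;
        return 0;
}

static void aux_end(struct aux_rb *rb, long size)
{
        rb->head += size;       /* plain += instead of local_add() */
        if (rb->head - rb->wakeup >= rb->watermark)
                rb->wakeup += rb->watermark;
        rb->nest = 0;
}

int main(void)
{
        struct aux_rb rb = { .watermark = 512 };

        if (aux_begin(&rb) == 0)
                aux_end(&rb, 2048);
        printf("head=%ld wakeup=%ld\n", rb.head, rb.wakeup);
        return 0;
}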
Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1502900297-21839-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 31 ++++++++++++++----------------- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 486fd78eb8d5..2941b868353c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -38,9 +38,9 @@ struct ring_buffer { struct user_struct *mmap_user; /* AUX area */ - local_t aux_head; + long aux_head; local_t aux_nest; - local_t aux_wakeup; + long aux_wakeup; unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ee97196bb151..25437fda56e3 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) goto err_put; - aux_head = local_read(&rb->aux_head); + aux_head = rb->aux_head; handle->rb = rb; handle->event = event; @@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); - handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; - local_set(&rb->aux_head, aux_head); + rb->aux_head = aux_head; } else { handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; - aux_head = local_read(&rb->aux_head); - local_add(size, &rb->aux_head); + aux_head = rb->aux_head; + rb->aux_head += size; } if (size || handle->aux_flags) { @@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags); } - aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { wakeup = true; - local_add(rb->aux_watermark, &rb->aux_wakeup); + rb->aux_wakeup += rb->aux_watermark; } if (wakeup) { @@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { struct ring_buffer *rb = handle->rb; - unsigned long aux_head; if (size > handle->size) return -ENOSPC; - local_add(size, &rb->aux_head); + rb->aux_head += size; - aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { perf_output_wakeup(handle); - local_add(rb->aux_watermark, &rb->aux_wakeup); - handle->wakeup = local_read(&rb->aux_wakeup) + - rb->aux_watermark; + rb->aux_wakeup += rb->aux_watermark; + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } - handle->head = aux_head; + handle->head = rb->aux_head; 
handle->size -= size; return 0; -- cgit v1.2.3 From d9a50b0256f06bd39a1bed1ba40baec37c356b11 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Aug 2017 17:18:17 +0100 Subject: perf/aux: Ensure aux_wakeup represents most recent wakeup index The aux_watermark member of struct ring_buffer represents the period (in terms of bytes) at which wakeup events should be generated when data is written to the aux buffer in non-snapshot mode. On hardware that cannot generate an interrupt when the aux_head reaches an arbitrary wakeup index (such as ARM SPE), the aux_head sampled from handle->head in perf_aux_output_{skip,end} may in fact be past the wakeup index. This can lead to wakeup slowly falling behind the head. For example, consider the case where hardware can only generate an interrupt on a page-boundary and the aux buffer is initialised as follows: // Buffer size is 2 * PAGE_SIZE rb->aux_head = rb->aux_wakeup = 0 rb->aux_watermark = PAGE_SIZE / 2 following the first perf_aux_output_begin call, the handle is initialised with: handle->head = 0 handle->size = 2 * PAGE_SIZE handle->wakeup = PAGE_SIZE / 2 and the hardware will be programmed to generate an interrupt at PAGE_SIZE. When the interrupt is raised, the hardware head will be at PAGE_SIZE, so calling perf_aux_output_end(handle, PAGE_SIZE) puts the ring buffer into the following state: rb->aux_head = PAGE_SIZE rb->aux_wakeup = PAGE_SIZE / 2 rb->aux_watermark = PAGE_SIZE / 2 and then the next call to perf_aux_output_begin will result in: handle->head = handle->wakeup = PAGE_SIZE for which the semantics are unclear and, for a smaller aux_watermark (e.g. PAGE_SIZE / 4), then the wakeup would in fact be behind head at this point. This patch fixes the problem by rounding down the aux_head (as sampled from the handle) to the nearest aux_watermark boundary when updating rb->aux_wakeup, therefore taking into account any overruns by the hardware. 
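The arithmetic can be seen in a few lines of userspace C (made-up sizes; rounddown() is a local helper matching the kernel's definition for these values). With a watermark of half a page and hardware that can only interrupt at page boundaries, the old '+= watermark' update falls further behind the head on every interrupt, while the rounded-down update keeps up:

#include <stdio.h>

#define PAGE_SIZE 4096L

static long rounddown(long x, long y) { return x - (x % y); }

int main(void)
{
        long watermark = PAGE_SIZE / 2;
        long head = 0, wake_old = 0, wake_new = 0;

        for (int i = 0; i < 3; i++) {
                /* hardware interrupts only on page boundaries */
                head += PAGE_SIZE;

                if (head - wake_old >= watermark)
                        wake_old += watermark;                 /* old behaviour */
                if (head - wake_new >= watermark)
                        wake_new = rounddown(head, watermark); /* fixed */

                printf("head=%5ld old_wakeup=%5ld (lag %5ld) new_wakeup=%5ld\n",
                       head, wake_old, head - wake_old, wake_new);
        }
        return 0;
}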
Reported-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1502900297-21839-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 2 +- kernel/events/ring_buffer.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2941b868353c..5377c591c57a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -40,7 +40,7 @@ struct ring_buffer { /* AUX area */ long aux_head; local_t aux_nest; - long aux_wakeup; + long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 25437fda56e3..af71a84e12ee 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -453,7 +453,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb->user_page->aux_head = rb->aux_head; if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { wakeup = true; - rb->aux_wakeup += rb->aux_watermark; + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); } if (wakeup) { @@ -486,7 +486,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->user_page->aux_head = rb->aux_head; if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { perf_output_wakeup(handle); - rb->aux_wakeup += rb->aux_watermark; + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } -- cgit v1.2.3 From 1d953111b648e48923171c3c9cf17be2250544fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Aug 2017 17:59:28 +0200 Subject: perf/core: Don't report zero PIDs for exiting tasks The exiting/dead task has no PIDs and in this case perf_event_pid/tid() return zero, change them to return -1 to distinguish this case from idle threads. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170822155928.GA6892@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1ac5015bab04..b411321b6c26 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx) return parent_ctx; } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, + enum pid_type type) { + u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; - return task_tgid_nr_ns(p, event->ns); + nr = __task_pid_nr_ns(p, type, event->ns); + /* avoid -1 if it is idle thread or runs in another ns */ + if (!nr && !pid_alive(p)) + nr = -1; + return nr; } -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; + return perf_event_pid_type(event, p, __PIDTYPE_TGID); +} - return task_pid_nr_ns(p, event->ns); +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + return perf_event_pid_type(event, p, PIDTYPE_PID); } /* -- cgit v1.2.3 From d0618410eced4eb092295fad10312a4545fcdfaf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 22 Aug 2017 19:22:43 +0200 Subject: tracing, perf: Adjust code layout in get_recursion_context() In an XDP redirect applications using tracepoint xdp:xdp_redirect to diagnose TX overrun, I noticed perf_swevent_get_recursion_context() was consuming 2% CPU. This was reduced to 1.85% with this simple change. Looking at the annotated asm code, it was clear that the unlikely case in_nmi() test was chosen (by the compiler) as the most likely event/branch. This small adjustment makes the compiler (GCC version 7.1.1 20170622 (Red Hat 7.1.1-3)) put in_nmi() as an unlikely branch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150342256382.16595.986861478681783732.stgit@firesoul Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5377c591c57a..843e97047335 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion) { int rctx; - if (in_nmi()) + if (unlikely(in_nmi())) rctx = 3; else if (in_irq()) rctx = 2; -- cgit v1.2.3 From a1d14934ea4b9db816a8dbfeab1c3e7204a0d871 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Aug 2017 12:52:32 +0200 Subject: workqueue/lockdep: 'Fix' flush_work() annotation The flush_work() annotation as introduced by commit: e159489baa71 ("workqueue: relax lockdep annotation on flush_work()") hits on the lockdep problem with recursive read locks. 
The situation as described is: Work W1: Work W2: Task: ARR(Q) ARR(Q) flush_workqueue(Q) A(W1) A(W2) A(Q) flush_work(W2) R(Q) A(W2) R(W2) if (special) A(Q) else ARR(Q) R(Q) where: A - acquire, ARR - acquire-read-recursive, R - release. Where under 'special' conditions we want to trigger a lock recursion deadlock, but otherwise allow the flush_work(). The allowing is done by using recursive read locks (ARR), but lockdep is broken for recursive stuff. However, there appears to be no need to acquire the lock if we're not 'special', so if we remove the 'else' clause things become much simpler and no longer need the recursion thing at all. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: byungchul.park@lge.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f128b3becfe1..8ad214dc15a9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2091,7 +2091,7 @@ __acquires(&pool->lock) spin_unlock_irq(&pool->lock); - lock_map_acquire_read(&pwq->wq->lockdep_map); + lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); crossrelease_hist_start(XHLOCK_PROC); trace_workqueue_execute_start(work); @@ -2826,16 +2826,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) spin_unlock_irq(&pool->lock); /* - * If @max_active is 1 or rescuer is in use, flushing another work - * item on the same workqueue may lead to deadlock. Make sure the - * flusher is not running on the same workqueue by verifying write - * access. + * Force a lock recursion deadlock when using flush_work() inside a + * single-threaded or rescuer equipped workqueue. + * + * For single threaded workqueues the deadlock happens when the work + * is after the work issuing the flush_work(). For rescuer equipped + * workqueues the deadlock happens when the rescuer stalls, blocking + * forward progress. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) { lock_map_acquire(&pwq->wq->lockdep_map); - else - lock_map_acquire_read(&pwq->wq->lockdep_map); - lock_map_release(&pwq->wq->lockdep_map); + lock_map_release(&pwq->wq->lockdep_map); + } return true; already_gone: -- cgit v1.2.3 From e6f3faa734a00c606b7b06c6b9f15e5627d3245b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Aug 2017 13:23:30 +0200 Subject: locking/lockdep: Fix workqueue crossrelease annotation The new completion/crossrelease annotations interact unfavourable with the extant flush_work()/flush_workqueue() annotations. The problem is that when a single work class does: wait_for_completion(&C) and complete(&C) in different executions, we'll build dependencies like: lock_map_acquire(W) complete_acquire(C) and lock_map_acquire(W) complete_release(C) which results in the dependency chain: W->C->W, which lockdep thinks spells deadlock, even though there is no deadlock potential since works are ran concurrently. One possibility would be to change the work 'lock' to recursive-read, but that would mean hitting a lockdep limitation on recursive locks. Also, unconditinoally switching to recursive-read here would fail to detect the actual deadlock on single-threaded workqueues, which do have a problem with this. 
For now, forcefully disregard these locks for crossrelease. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: byungchul.park@lge.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 56 +++++++++++++++++++++++++++++++----------------- kernel/workqueue.c | 23 +++++++++++++++++++- 2 files changed, 58 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 66011c9f5df3..f73ca595b81e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4629,7 +4629,7 @@ asmlinkage __visible void lockdep_sys_exit(void) * the index to point to the last entry, which is already invalid. */ crossrelease_hist_end(XHLOCK_PROC); - crossrelease_hist_start(XHLOCK_PROC); + crossrelease_hist_start(XHLOCK_PROC, false); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4725,25 +4725,25 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) /* * Lock history stacks; we have 3 nested lock history stacks: * - * Hard IRQ - * Soft IRQ - * History / Task + * HARD(IRQ) + * SOFT(IRQ) + * PROC(ess) * - * The thing is that once we complete a (Hard/Soft) IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. + * The thing is that once we complete a HARD/SOFT IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ. So + * what we do is rewind the history buffer and erase all our knowledge of that + * temporal event. * - * So what we do is rewind the history buffer and erase all our knowledge of - * that temporal event. - */ - -/* - * We need this to annotate lock history boundaries. Take for instance - * workqueues; each work is independent of the last. The completion of a future - * work does not depend on the completion of a past work (in general). - * Therefore we must not carry that (lock) dependency across works. + * The PROCess one is special though; it is used to annotate independence + * inside a task. + * + * Take for instance workqueues; each work is independent of the last. The + * completion of a future work does not depend on the completion of a past work + * (in general). Therefore we must not carry that (lock) dependency across + * works. * * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an 'idle' state and future completions do not + * pattern, where they have an invariant state and future completions do not * depend on past completions. Its just that since they all have the 'same' * form -- the kthread does the same over and over -- it doesn't typically * matter. @@ -4751,15 +4751,31 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) * The same is true for system-calls, once a system call is completed (we've * returned to userspace) the next system call does not depend on the lock * history of the previous system call. + * + * They key property for independence, this invariant state, is that it must be + * a point where we hold no locks and have no history. Because if we were to + * hold locks, the restore at _end() would not necessarily recover it's history + * entry. Similarly, independence per-definition means it does not depend on + * prior state. 
*/ -void crossrelease_hist_start(enum xhlock_context_t c) +void crossrelease_hist_start(enum xhlock_context_t c, bool force) { struct task_struct *cur = current; - if (cur->xhlocks) { - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; + if (!cur->xhlocks) + return; + + /* + * We call this at an invariant point, no current state, no history. + */ + if (c == XHLOCK_PROC) { + /* verified the former, ensure the latter */ + WARN_ON_ONCE(!force && cur->lockdep_depth); + invalidate_xhlock(&xhlock(cur->xhlock_idx)); } + + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; } void crossrelease_hist_end(enum xhlock_context_t c) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ad214dc15a9..c0331891dec1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2093,7 +2093,28 @@ __acquires(&pool->lock) lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); - crossrelease_hist_start(XHLOCK_PROC); + /* + * Strictly speaking we should do start(PROC) without holding any + * locks, that is, before these two lock_map_acquire()'s. + * + * However, that would result in: + * + * A(W1) + * WFC(C) + * A(W1) + * C(C) + * + * Which would create W1->C->W1 dependencies, even though there is no + * actual deadlock possible. There are two solutions, using a + * read-recursive acquire on the work(queue) 'locks', but this will then + * hit the lockdep limitation on recursive locks, or simly discard + * these locks. + * + * AFAICT there is no possible deadlock scenario between the + * flush_work() and complete() primitives (except for single-threaded + * workqueues), so hiding them isn't a problem. + */ + crossrelease_hist_start(XHLOCK_PROC, true); trace_workqueue_execute_start(work); worker->current_func(work); /* -- cgit v1.2.3 From 213c5a459ae0a7ef0a092f576aae2d5db6819360 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 10 Aug 2017 15:52:16 +0800 Subject: sched/topology: Fix memory leak in __sdt_alloc() Found this issue by kmemleak: the 'sg' and 'sgc' pointers from __sdt_alloc() might be leaked as each domain holds many groups' ref, but in destroy_sched_domain(), it only declined the first group ref. Onlining and offlining a CPU can trigger this leak, and cause OOM. Reproducer for my 6 CPUs machine: while true do echo 0 > /sys/devices/system/cpu/cpu5/online; echo 1 > /sys/devices/system/cpu/cpu5/online; done unreferenced object 0xffff88007d772a80 (size 64): comm "cpuhp/5", pid 39, jiffies 4294719962 (age 35.251s) hex dump (first 32 bytes): c0 22 77 7d 00 88 ff ff 02 00 00 00 01 00 00 00 ."w}............ 40 2a 77 7d 00 88 ff ff 00 00 00 00 00 00 00 00 @*w}............ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_node+0xf1/0x280 [] build_sched_domains+0x1e8/0xf20 [] partition_sched_domains+0x304/0x360 [] cpuset_update_active_cpus+0x17/0x40 [] sched_cpu_activate+0xae/0xc0 [] cpuhp_invoke_callback+0x90/0x400 [] cpuhp_up_callbacks+0x37/0xb0 [] cpuhp_thread_fun+0xd7/0xf0 [] smpboot_thread_fn+0x110/0x160 [] kthread+0x109/0x140 [] ret_from_fork+0x25/0x30 [] 0xffffffffffffffff unreferenced object 0xffff88007d772a40 (size 64): comm "cpuhp/5", pid 39, jiffies 4294719962 (age 35.251s) hex dump (first 32 bytes): 03 00 00 00 00 00 00 00 00 04 00 00 00 00 00 00 ................ 00 04 00 00 00 00 00 00 4f 3c fc ff 00 00 00 00 ........O<...... 
backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_node+0xf1/0x280 [] build_sched_domains+0xead/0xf20 [] partition_sched_domains+0x304/0x360 [] cpuset_update_active_cpus+0x17/0x40 [] sched_cpu_activate+0xae/0xc0 [] cpuhp_invoke_callback+0x90/0x400 [] cpuhp_up_callbacks+0x37/0xb0 [] cpuhp_thread_fun+0xd7/0xf0 [] smpboot_thread_fn+0x110/0x160 [] kthread+0x109/0x140 [] ret_from_fork+0x25/0x30 [] 0xffffffffffffffff Reported-by: Chunyu Hu Signed-off-by: Shu Wang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Chunyu Hu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: liwang@redhat.com Link: http://lkml.kernel.org/r/1502351536-9108-1-git-send-email-shuwang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index bd8b6d6f5387..4197f1346153 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -335,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) kfree(sg->sgc); - kfree(sg); + if (atomic_dec_and_test(&sg->ref)) + kfree(sg); sg = tmp; } while (sg != first); } @@ -343,15 +344,11 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) static void destroy_sched_domain(struct sched_domain *sd) { /* - * If its an overlapping domain it has private groups, iterate and - * nuke them all. + * A sched domain has many groups' reference, and an overlapping + * domain has private groups, iterate and nuke them all. */ - if (sd->flags & SD_OVERLAP) { - free_sched_groups(sd->groups, 1); - } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgc); - kfree(sd->groups); - } + free_sched_groups(sd->groups, 1); + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); kfree(sd); @@ -668,6 +665,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) else cpumask_copy(sg_span, sched_domain_span(sd)); + atomic_inc(&sg->ref); return sg; } -- cgit v1.2.3 From a090c4f2cd2cfeb54eaf8ad1e726a6c485f3abc3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Aug 2017 15:42:52 +0200 Subject: sched/topology: Improve comments Mike provided a better comment for destroy_sched_domain() ... Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4197f1346153..179b90b60ec6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -344,8 +344,9 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) static void destroy_sched_domain(struct sched_domain *sd) { /* - * A sched domain has many groups' reference, and an overlapping - * domain has private groups, iterate and nuke them all. + * A normal sched domain may have multiple group references, an + * overlapping domain, having private groups, only one. Iterate, + * dropping group/capacity references, freeing where none remain. 
*/ free_sched_groups(sd->groups, 1); -- cgit v1.2.3 From 77d1dfda0e79b41894880418f04794e92e4350e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Aug 2017 12:16:24 +0200 Subject: sched/topology, cpuset: Avoid spurious/wrong domain rebuilds When disabling cpuset.sched_load_balance we expect to be able to online CPUs without generating sched_domains. However this is currently completely broken. What happens is that we generate the sched_domains and then destroy them. This is because of the spurious 'default' domain build in cpuset_update_active_cpus(). That builds a single machine wide domain and then schedules a work to build the 'real' domains. The work then finds there are _no_ domains and destroys the lot again. Furthermore, if there actually were cpusets, building the machine wide domain is actively wrong, because it would allow tasks to 'escape' their cpuset. Also I don't think its needed, the scheduler really should respect the active mask. Reported-by: Ofer Levi(SW) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vineet.Gupta1@synopsys.com Cc: rusty@rustcorp.com.au Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8d5151688504..1d2369451939 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2343,13 +2343,7 @@ void cpuset_update_active_cpus(void) * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. - * - * We still need to do partition_sched_domains() synchronously; - * otherwise, the scheduler will get confused and put tasks to the - * dead CPU. Fall back to the default single domain. - * cpuset_hotplug_workfn() will rebuild it as necessary. */ - partition_sched_domains(1, NULL, NULL); schedule_work(&cpuset_hotplug_work); } -- cgit v1.2.3 From 09e0dd8e0f2e197690d34fed8cb4737114d3dd5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Aug 2017 12:16:24 +0200 Subject: sched/topology: Avoid pointless rebuild Fix partition_sched_domains() to try and preserve the existing machine wide domain instead of unconditionally destroying it. We do this by attempting to allocate the new single domain, only when that fails to we reuse the fallback_doms. When using fallback_doms we need to first destroy and then recreate because both the old and new could be backed by it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Ofer Levi(SW) Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vineet.Gupta1@synopsys.com Cc: rusty@rustcorp.com.au Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 179b90b60ec6..727daa2a0abe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1851,7 +1851,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); - n = doms_new ? 
ndoms_new : 0; + if (!doms_new) { + WARN_ON_ONCE(dattr_new); + n = 0; + doms_new = alloc_sched_domains(1); + if (doms_new) { + n = 1; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + } + } else { + n = ndoms_new; + } /* Destroy deleted domains: */ for (i = 0; i < ndoms_cur; i++) { @@ -1867,11 +1877,10 @@ match1: } n = ndoms_cur; - if (doms_new == NULL) { + if (!doms_new) { n = 0; doms_new = &fallback_doms; cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); } /* Build new domains: */ -- cgit v1.2.3 From bbdacdfed2f5fa50a2cc9f500a36e05990a0837d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Aug 2017 17:10:26 +0200 Subject: sched/debug: Optimize sched_domain sysctl generation Currently we unconditionally destroy all sysctl bits and regenerate them after we've rebuild the domains (even if that rebuild is a no-op). And since we unconditionally (re)build the sysctl for all possible CPUs, onlining all CPUs gets us O(n^2) time. Instead change this to only rebuild the bits for CPUs we've actually installed new domains on. Reported-by: Ofer Levi(SW) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 68 +++++++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 4 +++ kernel/sched/topology.c | 1 + 3 files changed, 59 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index cfd84f79e075..4a23bbc3111b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } +static cpumask_var_t sd_sysctl_cpus; static struct ctl_table_header *sd_sysctl_header; + void register_sched_domain_sysctl(void) { - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + static struct ctl_table *cpu_entries; + static struct ctl_table **cpu_idx; char buf[32]; + int i; - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; + if (!cpu_entries) { + cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); + if (!cpu_entries) + return; - if (entry == NULL) - return; + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = cpu_entries; + } - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; + if (!cpu_idx) { + struct ctl_table *e = cpu_entries; + + cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); + if (!cpu_idx) + return; + + /* deal with sparse possible map */ + for_each_possible_cpu(i) { + cpu_idx[i] = e; + e++; + } + } + + if (!cpumask_available(sd_sysctl_cpus)) { + if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) + return; + + /* init to possible to not have holes in @cpu_entries */ + cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); + } + + for_each_cpu(i, sd_sysctl_cpus) { + struct ctl_table *e = cpu_idx[i]; + + if (e->child) + sd_free_ctl_entry(&e->child); + + if (!e->procname) { + snprintf(buf, 32, "cpu%d", i); + e->procname = kstrdup(buf, GFP_KERNEL); + } + e->mode = 0555; + e->child = sd_alloc_ctl_cpu_table(i); + + __cpumask_clear_cpu(i, sd_sysctl_cpus); } WARN_ON(sd_sysctl_header); sd_sysctl_header = register_sysctl_table(sd_ctl_root); } +void dirty_sched_domain_sysctl(int cpu) +{ + if (cpumask_available(sd_sysctl_cpus)) + __cpumask_set_cpu(cpu, sd_sysctl_cpus); +} + /* may be called 
multiple times per register */ void unregister_sched_domain_sysctl(void) { unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); } #endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..25e5cb1107f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg); #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); void unregister_sched_domain_sysctl(void); #else static inline void register_sched_domain_sysctl(void) { } +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} static inline void unregister_sched_domain_sysctl(void) { } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 727daa2a0abe..6f7b43982f73 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -459,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + dirty_sched_domain_sysctl(cpu); destroy_sched_domains(tmp); update_top_cache_domain(cpu); -- cgit v1.2.3 From ce8bdd6957202a38d67038e5ec940eed50f9f3eb Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Tue, 22 Aug 2017 15:50:53 +0800 Subject: genirq: Fix semicolon.cocci warnings kernel/irq/proc.c:69:2-3: Unneeded semicolon Remove unneeded semicolon. Generated by: scripts/coccinelle/misc/semicolon.cocci Fixes: 0d3f54257dc3 ("genirq: Introduce effective affinity mask") Signed-off-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: kbuild-all@01.org Link: http://lkml.kernel.org/r/20170822075053.GA93890@lkp-hsx02 --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0534781724d0..9f62f9a1a5e9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -66,7 +66,7 @@ static int show_irq_affinity(int type, struct seq_file *m) #else return -EINVAL; #endif - }; + } switch (type) { case AFFINITY_LIST: -- cgit v1.2.3 From 20c4d49c0f304f3f945bbd560b26afa98f75a0c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 25 Aug 2017 15:14:09 +0300 Subject: irqdomain: Prevent potential NULL pointer dereference in irq_domain_push_irq() This code generates a Smatch warning: kernel/irq/irqdomain.c:1511 irq_domain_push_irq() warn: variable dereferenced before check 'root_irq_data' (see line 1508) irq_get_irq_data() can return a NULL pointer, but the code dereferences the returned pointer before checking it. Move the NULL pointer check before the dereference. [ tglx: Rewrote changelog to be precise and conforming to the instructions in submitting-patches and added a Fixes tag. Sigh! 
] Fixes: 495c38d3001f ("irqdomain: Add irq_domain_{push,pop}_irq() functions") Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Acked-by: David Daney Cc: Marc Zyngier Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20170825121409.6rfv4vt6ztz2oqkt@mwanda --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ff9912211e9..d62351714f3e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1504,10 +1504,10 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) if (WARN_ON(!irq_domain_is_hierarchy(domain))) return -EINVAL; - if (domain->parent != root_irq_data->domain) + if (!root_irq_data) return -EINVAL; - if (!root_irq_data) + if (domain->parent != root_irq_data->domain) return -EINVAL; child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL, -- cgit v1.2.3 From b33394ba5c0974a578c24b2fecbb91a984da5e09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Aug 2017 22:34:05 +0200 Subject: genirq/proc: Avoid uninitalized variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/irq/proc.c: In function ‘show_irq_affinity’: include/linux/cpumask.h:24:29: warning: ‘mask’ may be used uninitialized in this function [-Wmaybe-uninitialized] #define cpumask_bits(maskp) ((maskp)->bits) gcc is silly, but admittedly it can't know that this won't be called with anything else than the enumerated constants. Shut up the warning by creating a default clause. Fixes: 6bc6d4abd22e ("genirq/proc: Use the the accessor to report the effective affinity Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9f62f9a1a5e9..6376b4a598d3 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -63,9 +63,9 @@ static int show_irq_affinity(int type, struct seq_file *m) #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK mask = irq_data_get_effective_affinity_mask(&desc->irq_data); break; -#else - return -EINVAL; #endif + default: + return -EINVAL; } switch (type) { -- cgit v1.2.3 From 30d6e0a4190d37740e9447e4e4815f06992dd8c3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 24 Aug 2017 09:31:05 +0200 Subject: futex: Remove duplicated code and fix undefined behaviour There is code duplicated over all architecture's headers for futex_atomic_op_inuser. Namely op decoding, access_ok check for uaddr, and comparison of the result. Remove this duplication and leave up to the arches only the needed assembly which is now in arch_futex_atomic_op_inuser. This effectively distributes the Will Deacon's arm64 fix for undefined behaviour reported by UBSAN to all architectures. The fix was done in commit 5f16a046f8e1 (arm64: futex: Fix undefined behaviour with FUTEX_OP_OPARG_SHIFT usage). Look there for an example dump. And as suggested by Thomas, check for negative oparg too, because it was also reported to cause undefined behaviour report. Note that s390 removed access_ok check in d12a29703 ("s390/uaccess: remove pointless access_ok() checks") as access_ok there returns true. We introduce it back to the helper for the sake of simplicity (it gets optimized away anyway). 
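For reference, a small userspace sketch of the encoded_op layout the new common helper decodes. It uses the FUTEX_OP() packing macro and FUTEX_OP_* constants from the UAPI <linux/futex.h> and extracts the fields with the same masks and shifts as above (the sign extension of the two 12-bit arguments is left out for brevity):

#include <stdio.h>
#include <linux/futex.h>        /* FUTEX_OP(), FUTEX_OP_ADD, FUTEX_OP_CMP_GT, ... */

int main(void)
{
        /* "add 1 to *uaddr, then report whether the old value was > 0" */
        unsigned int encoded_op = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);

        unsigned int op     = (encoded_op & 0x70000000) >> 28;
        unsigned int shift  = !!(encoded_op & (FUTEX_OP_OPARG_SHIFT << 28));
        unsigned int cmp    = (encoded_op & 0x0f000000) >> 24;
        unsigned int oparg  = (encoded_op & 0x00fff000) >> 12;
        unsigned int cmparg =  encoded_op & 0x00000fff;

        printf("encoded=0x%08x op=%u oparg_shift=%u cmp=%u oparg=%u cmparg=%u\n",
               encoded_op, op, shift, cmp, oparg, cmparg);
        return 0;
}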
Signed-off-by: Jiri Slaby Signed-off-by: Thomas Gleixner Acked-by: Russell King Acked-by: Michael Ellerman (powerpc) Acked-by: Heiko Carstens [s390] Acked-by: Chris Metcalf [for tile] Reviewed-by: Darren Hart (VMware) Reviewed-by: Will Deacon [core/arm64] Cc: linux-mips@linux-mips.org Cc: Rich Felker Cc: linux-ia64@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: peterz@infradead.org Cc: Benjamin Herrenschmidt Cc: Max Filippov Cc: Paul Mackerras Cc: sparclinux@vger.kernel.org Cc: Jonas Bonn Cc: linux-s390@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: Yoshinori Sato Cc: linux-hexagon@vger.kernel.org Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Catalin Marinas Cc: Matt Turner Cc: linux-snps-arc@lists.infradead.org Cc: Fenghua Yu Cc: Arnd Bergmann Cc: linux-xtensa@linux-xtensa.org Cc: Stefan Kristiansson Cc: openrisc@lists.librecores.org Cc: Ivan Kokshaysky Cc: Stafford Horne Cc: linux-arm-kernel@lists.infradead.org Cc: Richard Henderson Cc: Chris Zankel Cc: Michal Simek Cc: Tony Luck Cc: linux-parisc@vger.kernel.org Cc: Vineet Gupta Cc: Ralf Baechle Cc: Richard Kuo Cc: linux-alpha@vger.kernel.org Cc: Martin Schwidefsky Cc: linuxppc-dev@lists.ozlabs.org Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20170824073105.3901-1-jslaby@suse.cz --- kernel/futex.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0939255fc750..3d38eaf05492 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1551,6 +1551,45 @@ out: return ret; } +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +{ + unsigned int op = (encoded_op & 0x70000000) >> 28; + unsigned int cmp = (encoded_op & 0x0f000000) >> 24; + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); + int oldval, ret; + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { + if (oparg < 0 || oparg > 31) + return -EINVAL; + oparg = 1 << oparg; + } + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + if (ret) + return ret; + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + return oldval == cmparg; + case FUTEX_OP_CMP_NE: + return oldval != cmparg; + case FUTEX_OP_CMP_LT: + return oldval < cmparg; + case FUTEX_OP_CMP_GE: + return oldval >= cmparg; + case FUTEX_OP_CMP_LE: + return oldval <= cmparg; + case FUTEX_OP_CMP_GT: + return oldval > cmparg; + default: + return -ENOSYS; + } +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: -- cgit v1.2.3 From 2b7e8665b4ff51c034c55df3cff76518d1a9ee3a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Aug 2017 15:55:43 -0700 Subject: fork: fix incorrect fput of ->exe_file causing use-after-free Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced an new error path before a reference is taken on the mm_struct's ->exe_file. Since the ->exe_file of the new mm_struct was already set to the old ->exe_file by the memcpy() in dup_mm(), it was possible for the mmput() in the error path of dup_mm() to drop a reference to ->exe_file which was never taken. This caused the struct file to later be freed prematurely. 
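The reference imbalance is easier to see in a toy userspace model (illustrative names only; a counter stands in for the struct file refcount): the child copies the parent's pointer via memcpy() without taking a reference, so a put on the error path releases a reference the child never owned and frees the object under the parent.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct file_obj { int refcount; };

static void put_file_obj(struct file_obj *f)
{
        if (--f->refcount == 0) {
                printf("freeing object while the parent still uses it!\n");
                free(f);
        }
}

int main(void)
{
        struct file_obj *exe = malloc(sizeof(*exe));
        exe->refcount = 1;                       /* the parent's reference */

        struct { struct file_obj *exe_file; } parent = { exe }, child;
        memcpy(&child, &parent, sizeof(child));  /* pointer copied, no get taken */

        /* Error path hit before the missing get on child.exe_file: */
        put_file_obj(child.exe_file);            /* drops the parent's reference */

        /* parent.exe_file now dangles; the fix NULLs the copy in mm_init(). */
        return 0;
}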
Fix it by updating mm_init() to NULL out the ->exe_file, in the same place it clears other things like the list of mmaps. This bug was found by syzkaller. It can be reproduced using the following C program: #define _GNU_SOURCE #include #include #include #include #include #include static void *mmap_thread(void *_arg) { for (;;) { mmap(NULL, 0x1000000, PROT_READ, MAP_POPULATE|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); } } static void *fork_thread(void *_arg) { usleep(rand() % 10000); fork(); } int main(void) { fork(); fork(); fork(); for (;;) { if (fork() == 0) { pthread_t t; pthread_create(&t, NULL, mmap_thread, NULL); pthread_create(&t, NULL, fork_thread, NULL); usleep(rand() % 10000); syscall(__NR_exit_group, 0); } wait(NULL); } } No special kernel config options are needed. It usually causes a NULL pointer dereference in __remove_shared_vm_struct() during exit, or in dup_mmap() (which is usually inlined into copy_process()) during fork. Both are due to a vm_area_struct's ->vm_file being used after it's already been freed. Google Bug Id: 64772007 Link: http://lkml.kernel.org/r/20170823211408.31198-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Tested-by: Mark Rutland Acked-by: Michal Hocko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [v4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e075b7780421..cbbea277b3fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,6 +806,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS -- cgit v1.2.3 From 0bcdc0987cce9880436b70836c6a92bb8e744fd1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 25 Aug 2017 15:57:04 -0700 Subject: time: Fix ktime_get_raw() incorrect base accumulation In comqit fc6eead7c1e2 ("time: Clean up CLOCK_MONOTONIC_RAW time handling"), the following code got mistakenly added to the update of the raw timekeeper: /* Update the monotonic raw base */ seconds = tk->raw_sec; nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); Which adds the raw_sec value and the shifted down raw xtime_nsec to the base value. But the read function adds the shifted down tk->tkr_raw.xtime_nsec value another time, The result of this is that ktime_get_raw() users (which are all internal users) see the raw time move faster then it should (the rate at which can vary with the current size of tkr_raw.xtime_nsec), which has resulted in at least problems with graphics rendering performance. The change tried to match the monotonic base update logic: seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); Which adds the wall_to_monotonic.tv_nsec value, but not the tk->tkr_mono.xtime_nsec value to the base. To fix this, simplify the tkr_raw.base accumulation to only accumulate the raw_sec portion, and do not include the tkr_raw.xtime_nsec portion, which will be added at read time. 
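A short arithmetic sketch of the double accumulation (made-up values; 'nsec' stands for tkr_raw.xtime_nsec already shifted down by tkr_raw.shift):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        /* Made-up snapshot of the raw timekeeper state. */
        unsigned long long raw_sec = 100;        /* tk->raw_sec                 */
        unsigned long long nsec    = 250000000;  /* tkr_raw.xtime_nsec >> shift */

        /* Buggy update: base already contains the nsec part ...              */
        unsigned long long base_buggy = raw_sec * NSEC_PER_SEC + nsec;
        /* Fixed update: base carries only the seconds part.                  */
        unsigned long long base_fixed = raw_sec * NSEC_PER_SEC;

        /* ... but the read path adds the shifted-down xtime_nsec again:      */
        printf("buggy read: %llu\n", base_buggy + nsec); /* nsec counted twice */
        printf("fixed read: %llu\n", base_fixed + nsec); /* counted once       */
        return 0;
}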
Fixes: fc6eead7c1e2 ("time: Clean up CLOCK_MONOTONIC_RAW time handling") Reported-and-tested-by: Chris Wilson Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Kevin Brodsky Cc: Richard Cochran Cc: Stephen Boyd Cc: Will Deacon Cc: Miroslav Lichvar Cc: Daniel Mentz Link: http://lkml.kernel.org/r/1503701824-1645-1-git-send-email-john.stultz@linaro.org --- kernel/time/timekeeping.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cedafa008de5..7e7e61c00d61 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->ktime_sec = seconds; /* Update the monotonic raw base */ - seconds = tk->raw_sec; - nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); - tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* must hold timekeeper_lock */ -- cgit v1.2.3 From 3510ca20ece0150af6b10c77a74ff1b5c198e3e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Aug 2017 13:55:12 -0700 Subject: Minor page waitqueue cleanups Tim Chen and Kan Liang have been battling a customer load that shows extremely long page wakeup lists. The cause seems to be constant NUMA migration of a hot page that is shared across a lot of threads, but the actual root cause for the exact behavior has not been found. Tim has a patch that batches the wait list traversal at wakeup time, so that we at least don't get long uninterruptible cases where we traverse and wake up thousands of processes and get nasty latency spikes. That is likely 4.14 material, but we're still discussing the page waitqueue specific parts of it. In the meantime, I've tried to look at making the page wait queues less expensive, and failing miserably. If you have thousands of threads waiting for the same page, it will be painful. We'll need to try to figure out the NUMA balancing issue some day, in addition to avoiding the excessive spinlock hold times. That said, having tried to rewrite the page wait queues, I can at least fix up some of the braindamage in the current situation. In particular: (a) we don't want to continue walking the page wait list if the bit we're waiting for already got set again (which seems to be one of the patterns of the bad load). That makes no progress and just causes pointless cache pollution chasing the pointers. (b) we don't want to put the non-locking waiters always on the front of the queue, and the locking waiters always on the back. Not only is that unfair, it means that we wake up thousands of reading threads that will just end up being blocked by the writer later anyway. Also add a comment about the layout of 'struct wait_page_key' - there is an external user of it in the cachefiles code that means that it has to match the layout of 'struct wait_bit_key' in the two first members. It so happens to match, because 'struct page *' and 'unsigned long *' end up having the same values simply because the page flags are the first member in struct page. 
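A minimal sketch of the waker-side contract introduced here, with hypothetical stand-ins for the wait queue types: a wake function may now return a negative value to abort the traversal (e.g. when the page bit is observed set again), a positive value counts an exclusive waiter as woken, and zero skips the entry.

#include <stdio.h>

/* Hypothetical stand-ins for wait_queue_entry and its wake function. */
struct waiter {
        int exclusive;                  /* models WQ_FLAG_EXCLUSIVE       */
        int (*func)(struct waiter *w);  /* <0 stop, >0 woken, 0 skipped   */
};

static void wake_up_common(struct waiter *w, int n, int nr_exclusive)
{
        for (int i = 0; i < n; i++) {
                int ret = w[i].func(&w[i]);

                if (ret < 0)            /* e.g. page bit set again: stop walking */
                        break;
                if (ret && w[i].exclusive && --nr_exclusive == 0)
                        break;          /* enough exclusive waiters woken */
        }
}

static int woken(struct waiter *w)   { (void)w; puts("woken");         return 1; }
static int bit_set(struct waiter *w) { (void)w; puts("bit set again"); return -1; }

int main(void)
{
        struct waiter q[] = {
                { 0, woken }, { 0, bit_set }, { 1, woken }, /* last entry never visited */
        };

        wake_up_common(q, 3, 1);
        return 0;
}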
Cc: Tim Chen Cc: Kan Liang Cc: Mel Gorman Cc: Christopher Lameter Cc: Andi Kleen Cc: Davidlohr Bueso Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 17f11c6b0a9f..d6afed6d0752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + int ret = curr->func(curr, mode, wake_flags, key); + if (ret < 0) + break; + if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } -- cgit v1.2.3 From 464bc0fd6273d518aee79fbd37211dd9bc35d863 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:04 -0700 Subject: bpf: convert sockmap field attach_bpf_fd2 to type In the initial sockmap API we provided strparser and verdict programs using a single attach command by extending the attach API with a the attach_bpf_fd2 field. However, if we add other programs in the future we will be adding a field for every new possible type, attach_bpf_fd(3,4,..). This seems a bit clumsy for an API. So lets push the programs using two new type fields. BPF_SK_SKB_STREAM_PARSER BPF_SK_SKB_STREAM_VERDICT This has the advantage of having a readable name and can easily be extended in the future. Updates to samples and sockmap included here also generalize tests slightly to support upcoming patch for multiple map support. Signed-off-by: John Fastabend Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/sockmap.c | 25 ++++++++++++++----------- kernel/bpf/syscall.c | 38 +++++++++++--------------------------- 2 files changed, 25 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 617c239590c2..cf570d108fd5 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -723,20 +723,24 @@ out: return err; } -static int sock_map_attach_prog(struct bpf_map *map, - struct bpf_prog *parse, - struct bpf_prog *verdict) +int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_prog *_parse, *_verdict; + struct bpf_prog *orig; - _parse = xchg(&stab->bpf_parse, parse); - _verdict = xchg(&stab->bpf_verdict, verdict); + switch (type) { + case BPF_SK_SKB_STREAM_PARSER: + orig = xchg(&stab->bpf_parse, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + orig = xchg(&stab->bpf_verdict, prog); + break; + default: + return -EOPNOTSUPP; + } - if (_parse) - bpf_prog_put(_parse); - if (_verdict) - bpf_prog_put(_verdict); + if (orig) + bpf_prog_put(orig); return 0; } @@ -777,7 +781,6 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_attach = sock_map_attach_prog, }; BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9378f3ba2cbf..021a05d9d800 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1093,12 +1093,12 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) +static int sockmap_get_from_fd(const union bpf_attr *attr) { - struct bpf_prog *prog1, *prog2; int ufd = attr->target_fd; + struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1108,29 +1108,16 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) if (IS_ERR(map)) return PTR_ERR(map); - if (!map->ops->map_attach) { - fdput(f); - return -EOPNOTSUPP; - } - - prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog1)) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { fdput(f); - return PTR_ERR(prog1); - } - - prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); - if (IS_ERR(prog2)) { - fdput(f); - bpf_prog_put(prog1); - return PTR_ERR(prog2); + return PTR_ERR(prog); } - err = map->ops->map_attach(map, prog1, prog2); + err = sock_map_attach_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog1); - bpf_prog_put(prog2); + bpf_prog_put(prog); return err; } @@ -1165,16 +1152,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; - case BPF_CGROUP_SMAP_INGRESS: - ptype = BPF_PROG_TYPE_SK_SKB; - break; + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + return sockmap_get_from_fd(attr); default: return -EINVAL; } - if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS) - return sockmap_get_from_fd(attr, ptype); - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit v1.2.3 From 2f857d04601a1bb56958b95a9f180bce0e91e5e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:25 -0700 Subject: bpf: 
sockmap, remove STRPARSER map_flags and add multi-map support The addition of map_flags BPF_SOCKMAP_STRPARSER flags was to handle a specific use case where we want to have BPF parse program disabled on an entry in a sockmap. However, Alexei found the API a bit cumbersome and I agreed. Lets remove the STRPARSER flag and support the use case by allowing socks to be in multiple maps. This allows users to create two maps one with programs attached and one without. When socks are added to maps they now inherit any programs attached to the map. This is a nice generalization and IMO improves the API. The API rules are less ambiguous and do not need a flag: - When a sock is added to a sockmap we have two cases, i. The sock map does not have any attached programs so we can add sock to map without inheriting bpf programs. The sock may exist in 0 or more other maps. ii. The sock map has an attached BPF program. To avoid duplicate bpf programs we only add the sock entry if it does not have an existing strparser/verdict attached, returning -EBUSY if a program is already attached. Otherwise attach the program and inherit strparser/verdict programs from the sock map. This allows for socks to be in a multiple maps for redirects and inherit a BPF program from a single map. Also this patch simplifies the logic around BPF_{EXIST|NOEXIST|ANY} flags. In the original patch I tried to be extra clever and only update map entries when necessary. Now I've decided the complexity is not worth it. If users constantly update an entry with the same sock for no reason (i.e. update an entry without actually changing any parameters on map or sock) we still do an alloc/release. Using this and allowing multiple entries of a sock to exist in a map the logic becomes much simpler. Note: Now that multiple maps are supported the "maps" pointer called when a socket is closed becomes a list of maps to remove the sock from. To keep the map up to date when a sock is added to the sockmap we must add the map/elem in the list. Likewise when it is removed we must remove it from the list. This results in searching the per psock list on delete operation. On TCP_CLOSE events we walk the list and remove the psock from all map/entry locations. I don't see any perf implications in this because at most I have a psock in two maps. If a psock were to be in many maps its possibly this might be noticeable on delete but I can't think of a reason to dup a psock in many maps. The sk_callback_lock is used to protect read/writes to the list. This was convenient because in all locations we were taking the lock anyways just after working on the list. Also the lock is per sock so in normal cases we shouldn't see any contention. Suggested-by: Alexei Starovoitov Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 269 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 165 insertions(+), 104 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf570d108fd5..a6882e54930b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -13,15 +13,16 @@ /* A BPF sock_map is used to store sock objects. This is primarly used * for doing socket redirect with BPF helper routines. * - * A sock map may have two BPF programs attached to it, a program used - * to parse packets and a program to provide a verdict and redirect - * decision on the packet. 
If no BPF parse program is provided it is - * assumed that every skb is a "message" (skb->len). Otherwise the - * parse program is attached to strparser and used to build messages - * that may span multiple skbs. The verdict program will either select - * a socket to send/receive the skb on or provide the drop code indicating - * the skb should be dropped. More actions may be added later as needed. - * The default program will drop packets. + * A sock map may have BPF programs attached to it, currently a program + * used to parse packets and a program to provide a verdict and redirect + * decision on the packet are supported. Any programs attached to a sock + * map are inherited by sock objects when they are added to the map. If + * no BPF programs are attached the sock object may only be used for sock + * redirect. + * + * A sock object may be in multiple maps, but can only inherit a single + * parse or verdict program. If adding a sock object to a map would result + * in having multiple parsing programs the update will return an EBUSY error. * * For reference this program is similar to devmap used in XDP context * reviewing these together may be useful. For an example please review @@ -44,15 +45,21 @@ struct bpf_stab { struct sock **sock_map; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; - refcount_t refcnt; }; enum smap_psock_state { SMAP_TX_RUNNING, }; +struct smap_psock_map_entry { + struct list_head list; + struct sock **entry; +}; + struct smap_psock { struct rcu_head rcu; + /* refcnt is used inside sk_callback_lock */ + u32 refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -66,10 +73,9 @@ struct smap_psock { struct strparser strp; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; - struct bpf_stab *stab; + struct list_head maps; /* Back reference used when sock callback trigger sockmap operations */ - int key; struct sock *sock; unsigned long state; @@ -83,7 +89,7 @@ struct smap_psock { static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { - return (struct smap_psock *)rcu_dereference_sk_user_data(sk); + return rcu_dereference_sk_user_data(sk); } static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) @@ -149,11 +155,12 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void smap_release_sock(struct sock *sock); +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); /* Called with lock_sock(sk) held */ static void smap_state_change(struct sock *sk) { + struct smap_psock_map_entry *e, *tmp; struct smap_psock *psock; struct sock *osk; @@ -184,9 +191,15 @@ static void smap_state_change(struct sock *sk) psock = smap_psock_sk(sk); if (unlikely(!psock)) break; - osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL); - if (osk == sk) - smap_release_sock(sk); + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); break; default: psock = smap_psock_sk(sk); @@ -289,9 +302,8 @@ static void smap_write_space(struct sock *sk) static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) { - write_lock_bh(&sk->sk_callback_lock); if (!psock->strp_enabled) - goto out; + return; sk->sk_data_ready = psock->save_data_ready; sk->sk_write_space = psock->save_write_space; sk->sk_state_change = psock->save_state_change; @@ -300,8 +312,6 @@ static 
void smap_stop_sock(struct smap_psock *psock, struct sock *sk) psock->save_state_change = NULL; strp_stop(&psock->strp); psock->strp_enabled = false; -out: - write_unlock_bh(&sk->sk_callback_lock); } static void smap_destroy_psock(struct rcu_head *rcu) @@ -318,9 +328,11 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } -static void smap_release_sock(struct sock *sock) +static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - struct smap_psock *psock = smap_psock_sk(sock); + psock->refcnt--; + if (psock->refcnt) + return; smap_stop_sock(psock, sock); clear_bit(SMAP_TX_RUNNING, &psock->state); @@ -414,6 +426,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { + struct smap_psock_map_entry *e, *tmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -431,8 +444,10 @@ static void smap_gc_work(struct work_struct *w) if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); - if (refcount_dec_and_test(&psock->stab->refcnt)) - sock_map_remove_complete(psock->stab); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + list_del(&e->list); + kfree(e); + } sock_put(psock->sock); kfree(psock); @@ -453,6 +468,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock, skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); + INIT_LIST_HEAD(&psock->maps); + psock->refcnt = 1; rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -503,13 +520,24 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (!stab->sock_map) goto free_stab; - refcount_set(&stab->refcnt, 1); return &stab->map; free_stab: kfree(stab); return ERR_PTR(err); } +static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +{ + struct smap_psock_map_entry *e, *tmp; + + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + if (e->entry == entry) { + list_del(&e->list); + break; + } + } +} + static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); @@ -526,13 +554,18 @@ static void sock_map_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < stab->map.max_entries; i++) { + struct smap_psock *psock; struct sock *sock; sock = xchg(&stab->sock_map[i], NULL); if (!sock) continue; - smap_release_sock(sock); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); @@ -541,8 +574,7 @@ static void sock_map_free(struct bpf_map *map) if (stab->bpf_parse) bpf_prog_put(stab->bpf_parse); - if (refcount_dec_and_test(&stab->refcnt)) - sock_map_remove_complete(stab); + sock_map_remove_complete(stab); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -576,6 +608,7 @@ struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static int sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct smap_psock *psock; int k = *(u32 *)key; struct sock *sock; @@ -586,7 +619,17 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!sock) return -EINVAL; - smap_release_sock(sock); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + if (!psock) + goto out; + + if (psock->bpf_parse) + smap_stop_sock(psock, sock); + 
smap_list_remove(psock, &stab->sock_map[k]); + smap_release_sock(psock, sock); +out: + write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -601,29 +644,34 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) * and syncd so we are certain all references from the update/lookup/delete * operations as well as references in the data path are no longer in use. * - * A psock object holds a refcnt on the sockmap it is attached to and this is - * not decremented until after a RCU grace period and garbage collection occurs. - * This ensures the map is not free'd until psocks linked to it are removed. The - * map link is used when the independent sock events trigger map deletion. + * Psocks may exist in multiple maps, but only a single set of parse/verdict + * programs may be inherited from the maps it belongs to. A reference count + * is kept with the total number of references to the psock from all maps. The + * psock will not be released until this reaches zero. The psock and sock + * user data data use the sk_callback_lock to protect critical data structures + * from concurrent access. This allows us to avoid two updates from modifying + * the user data in sock and the lock is required anyways for modifying + * callbacks, we simply increase its scope slightly. * - * Psocks may only participate in one sockmap at a time. Users that try to - * join a single sock to multiple maps will get an error. - * - * Last, but not least, it is possible the socket is closed while running - * an update on an existing psock. This will release the psock, but again - * not until the update has completed due to rcu grace period rules. + * Rules to follow, + * - psock must always be read inside RCU critical section + * - sk_user_data must only be modified inside sk_callback_lock and read + * inside RCU critical section. 
+ * - psock->maps list must only be read & modified inside sk_callback_lock + * - sock_map must use READ_ONCE and (cmp)xchg operations + * - BPF verdict/parse programs must use READ_ONCE and xchg operations */ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct bpf_map *map, - void *key, u64 flags, u64 map_flags) + void *key, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct smap_psock_map_entry *e = NULL; struct bpf_prog *verdict, *parse; - struct smap_psock *psock = NULL; - struct sock *old_sock, *sock; + struct sock *osock, *sock; + struct smap_psock *psock; u32 i = *(u32 *)key; - bool update = false; - int err = 0; + int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; @@ -631,35 +679,22 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER)) - return -EINVAL; - - verdict = parse = NULL; sock = READ_ONCE(stab->sock_map[i]); - - if (flags == BPF_EXIST || flags == BPF_ANY) { - if (!sock && flags == BPF_EXIST) { - return -ENOENT; - } else if (sock && sock != skops->sk) { - return -EINVAL; - } else if (sock) { - psock = smap_psock_sk(sock); - if (unlikely(!psock)) - return -EBUSY; - update = true; - } - } else if (sock && BPF_NOEXIST) { + if (flags == BPF_EXIST && !sock) + return -ENOENT; + else if (flags == BPF_NOEXIST && sock) return -EEXIST; - } - /* reserve BPF programs early so can abort easily on failures */ - if (map_flags & BPF_SOCKMAP_STRPARSER) { - verdict = READ_ONCE(stab->bpf_verdict); - parse = READ_ONCE(stab->bpf_parse); + sock = skops->sk; - if (!verdict || !parse) - return -ENOENT; + /* 1. If sock map has BPF programs those will be inherited by the + * sock being added. If the sock is already attached to BPF programs + * this results in an error. + */ + verdict = READ_ONCE(stab->bpf_verdict); + parse = READ_ONCE(stab->bpf_parse); + if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation * removes the program after the above READ_ONCE() but before * we increment the refcnt. If this is the case abort with an @@ -676,50 +711,78 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } - if (!psock) { - sock = skops->sk; - if (rcu_dereference_sk_user_data(sock)) - return -EEXIST; + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + + /* 2. Do not allow inheriting programs if psock exists and has + * already inherited programs. This would create confusion on + * which parser/verdict program is running. If no psock exists + * create one. Inside sk_callback_lock to ensure concurrent create + * doesn't update user data. + */ + if (psock) { + if (READ_ONCE(psock->bpf_parse) && parse) { + err = -EBUSY; + goto out_progs; + } + psock->refcnt++; + } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { - if (verdict) - bpf_prog_put(verdict); - if (parse) - bpf_prog_put(parse); - return PTR_ERR(psock); + err = PTR_ERR(psock); + goto out_progs; } - psock->key = i; - psock->stab = stab; - refcount_inc(&stab->refcnt); + set_bit(SMAP_TX_RUNNING, &psock->state); } - if (map_flags & BPF_SOCKMAP_STRPARSER) { - write_lock_bh(&sock->sk_callback_lock); - if (psock->strp_enabled) - goto start_done; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } + e->entry = &stab->sock_map[i]; + + /* 3. At this point we have a reference to a valid psock that is + * running. 
Attach any BPF programs needed. + */ + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) - goto out; + goto out_free; smap_init_progs(psock, stab, verdict, parse); smap_start_sock(psock, sock); -start_done: - write_unlock_bh(&sock->sk_callback_lock); - } else if (update) { - smap_stop_sock(psock, sock); } - if (!update) { - old_sock = xchg(&stab->sock_map[i], skops->sk); - if (old_sock) - smap_release_sock(old_sock); - } + /* 4. Place psock in sockmap for use and stop any programs on + * the old sock assuming its not the same sock we are replacing + * it with. Because we can only have a single set of programs if + * old_sock has a strp we can stop it. + */ + list_add_tail(&e->list, &psock->maps); + write_unlock_bh(&sock->sk_callback_lock); + osock = xchg(&stab->sock_map[i], sock); + if (osock) { + struct smap_psock *opsock = smap_psock_sk(osock); + + write_lock_bh(&osock->sk_callback_lock); + if (osock != sock && parse) + smap_stop_sock(opsock, osock); + smap_list_remove(opsock, &stab->sock_map[i]); + smap_release_sock(opsock, osock); + write_unlock_bh(&osock->sk_callback_lock); + } return 0; -out: +out_free: + smap_release_sock(psock, sock); +out_progs: + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); write_unlock_bh(&sock->sk_callback_lock); - if (!update) - smap_release_sock(sock); + kfree(e); return err; } @@ -768,8 +831,7 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } - err = sock_map_ctx_update_elem(&skops, map, key, - flags, BPF_SOCKMAP_STRPARSER); + err = sock_map_ctx_update_elem(&skops, map, key, flags); fput(socket->file); return err; } @@ -783,11 +845,11 @@ const struct bpf_map_ops sock_map_ops = { .map_delete_elem = sock_map_delete_elem, }; -BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); - return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags); + return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } const struct bpf_func_proto bpf_sock_map_update_proto = { @@ -799,5 +861,4 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, }; -- cgit v1.2.3 From d26e597d87635d90128fafb3f6bb0a14d972d952 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:45 -0700 Subject: bpf: sockmap add missing rcu_read_(un)lock in smap_data_ready References to psock must be done inside RCU critical section. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Signed-off-by: David S. 
Miller --- kernel/bpf/sockmap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a6882e54930b..266011c822ec 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -227,11 +227,14 @@ static void smap_data_ready(struct sock *sk) { struct smap_psock *psock; - write_lock_bh(&sk->sk_callback_lock); + rcu_read_lock(); psock = smap_psock_sk(sk); - if (likely(psock)) + if (likely(psock)) { + write_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); + write_unlock_bh(&sk->sk_callback_lock); + } + rcu_read_unlock(); } static void smap_tx_work(struct work_struct *w) -- cgit v1.2.3 From 81374aaa2693f8d3cd6cf3656a02ac8cf5c7ebea Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:11:43 -0700 Subject: bpf: harden sockmap program attach to ensure correct map type When attaching a program to sockmap we need to check map type is correct. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 266011c822ec..38bf4e4ae2fd 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -794,6 +794,9 @@ int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; + if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP)) + return -EINVAL; + switch (type) { case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); -- cgit v1.2.3 From 78aeaaef997db7096a17d0d3572a7940ffa5c9a0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:12:01 -0700 Subject: bpf: sockmap indicate sock events to listeners After userspace pushes sockets into a sockmap it may not be receiving data (assuming stream_{parser|verdict} programs are attached). But, it may still want to manage the socks. A common pattern is to poll/select for a POLLRDHUP event so we can close the sock. This patch adds the logic to wake up these listeners. Also add TCP_SYN_SENT to the list of events to handle. We don't want to break the connection just because we happen to be in this state. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 38bf4e4ae2fd..bcc326a2e5ce 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -162,6 +162,7 @@ static void smap_state_change(struct sock *sk) { struct smap_psock_map_entry *e, *tmp; struct smap_psock *psock; + struct socket_wq *wq; struct sock *osk; rcu_read_lock(); @@ -171,6 +172,7 @@ static void smap_state_change(struct sock *sk) * is established. 
*/ switch (sk->sk_state) { + case TCP_SYN_SENT: case TCP_SYN_RECV: case TCP_ESTABLISHED: break; @@ -208,6 +210,10 @@ static void smap_state_change(struct sock *sk) smap_report_sk_error(psock, EPIPE); break; } + + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } -- cgit v1.2.3 From f740c34ee5cf62e1fff5ebec6d0e63efcc3cdfe9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 25 Aug 2017 23:27:14 +0300 Subject: bpf: fix oops on allocation failure "err" is set to zero if bpf_map_area_alloc() fails so it means we return ERR_PTR(0) which is NULL. The caller, find_and_alloc_map(), is not expecting NULL returns and will oops. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: Dan Carpenter Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index bcc326a2e5ce..db0d99d2fe18 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -523,6 +523,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (err) goto free_stab; + err = -ENOMEM; stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); -- cgit v1.2.3 From f12f42acdbb577a12eecfcebbbec41c81505c4dc Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Aug 2017 17:07:50 -0400 Subject: perf/core: Fix potential double-fetch bug While examining the kernel source code, I found a dangerous operation that could turn into a double-fetch situation (a race condition bug) where the same userspace memory region are fetched twice into kernel with sanity checks after the first fetch while missing checks after the second fetch. 1. The first fetch happens in line 9573 get_user(size, &uattr->size). 2. Subsequently the 'size' variable undergoes a few sanity checks and transformations (line 9577 to 9584). 3. The second fetch happens in line 9610 copy_from_user(attr, uattr, size) 4. Given that 'uattr' can be fully controlled in userspace, an attacker can race condition to override 'uattr->size' to arbitrary value (say, 0xFFFFFFFF) after the first fetch but before the second fetch. The changed value will be copied to 'attr->size'. 5. There is no further checks on 'attr->size' until the end of this function, and once the function returns, we lose the context to verify that 'attr->size' conforms to the sanity checks performed in step 2 (line 9577 to 9584). 6. My manual analysis shows that 'attr->size' is not used elsewhere later, so, there is no working exploit against it right now. However, this could easily turns to an exploitable one if careless developers start to use 'attr->size' later. To fix this, override 'attr->size' from the second fetch to the one from the first fetch, regardless of what is actually copied in. In this way, it is assured that 'attr->size' is consistent with the checks performed after the first fetch. 
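The defensive pattern generalizes beyond perf_copy_attr(), so a minimal sketch may help: whatever length field was fetched and sanity-checked first is the value that must survive, never the copy re-fetched from userspace. The my_attr structure and helper below are illustrative, not the kernel's code; they only assume the standard get_user()/copy_from_user() helpers.

#include <linux/types.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

struct my_attr {
	u32 size;
	u64 flags;
};

static int copy_my_attr(struct my_attr *attr, struct my_attr __user *uattr)
{
	u32 size;

	if (get_user(size, &uattr->size))		/* first fetch */
		return -EFAULT;
	if (size > sizeof(*attr))			/* sanity check */
		size = sizeof(*attr);

	memset(attr, 0, sizeof(*attr));
	if (copy_from_user(attr, uattr, size))		/* second fetch */
		return -EFAULT;

	/* never trust the re-fetched, user-controlled size */
	attr->size = size;
	return 0;
}
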
Signed-off-by: Meng Xu Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: meng.xu@gatech.edu Cc: sanidhya@gatech.edu Cc: taesoo@gatech.edu Link: http://lkml.kernel.org/r/1503522470-35531-1-git-send-email-meng.xu@gatech.edu Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3504125871d2..ce131d25622a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9611,6 +9611,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + attr->size = size; + if (attr->__reserved_1) return -EINVAL; -- cgit v1.2.3 From 75e8387685f6c65feb195a4556110b58f852b848 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Fri, 25 Aug 2017 21:49:37 +0800 Subject: perf/ftrace: Fix double traces of perf on ftrace:function When running perf on the ftrace:function tracepoint, there is a bug which can be reproduced by: perf record -e ftrace:function -a sleep 20 & perf record -e ftrace:function ls perf script ls 10304 [005] 171.853235: ftrace:function: perf_output_begin ls 10304 [005] 171.853237: ftrace:function: perf_output_begin ls 10304 [005] 171.853239: ftrace:function: task_tgid_nr_ns ls 10304 [005] 171.853240: ftrace:function: task_tgid_nr_ns ls 10304 [005] 171.853242: ftrace:function: __task_pid_nr_ns ls 10304 [005] 171.853244: ftrace:function: __task_pid_nr_ns We can see that all the function traces are doubled. The problem is caused by the inconsistency of the register function perf_ftrace_event_register() with the probe function perf_ftrace_function_call(). The former registers one probe for every perf_event. And the latter handles all perf_events on the current cpu. So when two perf_events on the current cpu, the traces of them will be doubled. So this patch adds an extra parameter "event" for perf_tp_event, only send sample data to this event when it's not NULL. 
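The crux of the fix, visible in the diff below, is that each ftrace_ops registered by perf is embedded in its owning perf_event, so the probe can recover that owner with container_of() and hand the sample to it alone. A self-contained illustration of that recovery step, using toy types in place of the kernel's:

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ops   { int flags; };
struct event { int id; struct ops ftrace_ops; };

/* The callback only receives the embedded ops pointer; container_of()
 * walks back to the enclosing event so the sample is charged to that
 * event instead of being broadcast to every event on the CPU. */
static struct event *owner_of(struct ops *ops)
{
	return container_of(ops, struct event, ftrace_ops);
}
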
Signed-off-by: Zhou Chengming Reviewed-by: Jiri Olsa Acked-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: huawei.libin@huawei.com Link: http://lkml.kernel.org/r/1503668977-12526-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 +++++++++---- kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 5 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ce131d25622a..03ac9c8b02fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7906,16 +7906,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task); + rctx, task, NULL); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task) + struct task_struct *task, struct perf_event *event) { struct perf_sample_data data; - struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7929,9 +7928,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - hlist_for_each_entry_rcu(event, head, hlist_entry) { + /* Use the given event instead of the hlist */ + if (event) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); + } else { + hlist_for_each_entry_rcu(event, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } } /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..13ba2d3f6a91 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,6 +306,7 @@ static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { + struct perf_event *event; struct ftrace_entry *entry; struct hlist_head *head; struct pt_regs regs; @@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; + event = container_of(ops, struct perf_event, ftrace_ops); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, ®s, head, NULL); + 1, ®s, head, NULL, event); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9b5aa10fbf9..8a907e12b6b9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git 
a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..74d9a86eccc0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -667,7 +667,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, - 1, regs, head, NULL); + 1, regs, head, NULL, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a7581fec9681..4525e0271a53 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, } perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); out: preempt_enable(); } -- cgit v1.2.3 From 8d4e6c4caa12dafbcba138e5450b7af17b0b2194 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Mar 2017 18:39:56 +0300 Subject: perf/core, pt, bts: Get rid of itrace_started I just noticed that hw.itrace_started and hw.config are aliased to the same location. Now, the PT driver happens to use both, which works out fine by sheer luck: - STORE(hw.itrace_start) is ordered before STORE(hw.config), in the program order, although there are no compiler barriers to ensure that, - to the perf_log_itrace_start() hw.itrace_start looks set at the same time as when it is intended to be set because both stores happen in the same path, - hw.config is never reset to zero in the PT driver. Now, the use of hw.config by the PT driver makes more sense (it being a HW PMU) than messing around with itrace_started, which is an awkward API to begin with. This patch replaces hw.itrace_started with an attach_state bit and an API call for the PMU drivers to use to communicate the condition. 
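Under the new interface an AUX-tracing PMU driver signals the condition explicitly rather than writing an aliased hw field. A hedged sketch of such a call site; my_pt_start() is hypothetical and only assumes the perf_event_itrace_started() helper added below:

#include <linux/perf_event.h>

static void my_pt_start(struct perf_event *event)
{
	/* ... program the trace hardware for this event ... */

	/* tell the core that itrace data is now being produced */
	perf_event_itrace_started(event);

	/* perf_log_itrace_start() will see PERF_ATTACH_ITRACE set */
}
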
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20170330153956.25994-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e5467e107624..77fd6b11ef22 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7301,6 +7301,11 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +void perf_event_itrace_started(struct perf_event *event) +{ + event->attach_state |= PERF_ATTACH_ITRACE; +} + static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; @@ -7316,7 +7321,7 @@ static void perf_log_itrace_start(struct perf_event *event) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || - event->hw.itrace_started) + event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; -- cgit v1.2.3 From fc7ce9c74c3ad232b084d80148654f926d01ece7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 28 Aug 2017 20:52:49 -0400 Subject: perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR For understanding how the workload maps to memory channels and hardware behavior, it's very important to collect address maps with physical addresses. For example, 3D XPoint access can only be found by filtering the physical address. Add a new sample type for physical address. perf already has a facility to collect data virtual address. This patch introduces a function to convert the virtual address to physical address. The function is quite generic and can be extended to any architecture as long as a virtual address is provided. - For kernel direct mapping addresses, virt_to_phys is used to convert the virtual addresses to physical address. - For user virtual addresses, __get_user_pages_fast is used to walk the pages tables for user physical address. - This does not work for vmalloc addresses right now. These are not resolved, but code to do that could be added. The new sample type requires collecting the virtual address. The virtual address will not be output unless SAMPLE_ADDR is applied. For security, the physical address can only be exposed to root or privileged user. 
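From userspace the new sample type is requested like any other sample_type bit. A hedged sketch of an opener follows, assuming a kernel with this patch and a uapi perf_event.h that defines PERF_SAMPLE_PHYS_ADDR; the cycles event and period are illustrative, and it must run as root since unprivileged users get -EACCES:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_phys_addr_sampling(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_HARDWARE;
	attr.config        = PERF_COUNT_HW_CPU_CYCLES;	/* illustrative */
	attr.sample_period = 100000;
	attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			     PERF_SAMPLE_PHYS_ADDR;
	attr.disabled      = 1;

	return syscall(__NR_perf_event_open, &attr,
		       0 /* this task */, -1 /* any cpu */,
		       -1 /* no group */, 0);
}
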
Tested-by: Madhavan Srinivasan Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mpe@ellerman.id.au Link: http://lkml.kernel.org/r/1503967969-48278-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 77fd6b11ef22..ce64f3fed5c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1575,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + size += sizeof(data->phys_addr); + event->header_size = size; } @@ -6017,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + perf_output_put(handle, data->phys_addr); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6032,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle, } } +static u64 perf_virt_to_phys(u64 virt) +{ + u64 phys_addr = 0; + struct page *p = NULL; + + if (!virt) + return 0; + + if (virt >= TASK_SIZE) { + /* If it's vmalloc()d memory, leave phys_addr as 0 */ + if (virt_addr_valid((void *)(uintptr_t)virt) && + !(virt >= VMALLOC_START && virt < VMALLOC_END)) + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); + } else { + /* + * Walking the pages tables for user address. + * Interrupts are disabled, so it prevents any tear down + * of the page tables. + * Try IRQ-safe __get_user_pages_fast first. + * If failed, leave phys_addr as 0. + */ + if ((current->mm != NULL) && + (__get_user_pages_fast(virt, 1, 0, &p) == 1)) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + + if (p) + put_page(p); + } + + return phys_addr; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6150,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + data->phys_addr = perf_virt_to_phys(data->addr); } static void __always_inline @@ -9909,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* Only privileged users can get physical addresses */ + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!attr.sample_max_stack) attr.sample_max_stack = sysctl_perf_event_max_stack; -- cgit v1.2.3 From f52be5708076b75a045ac52c6fef3fffb8300525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Aug 2017 10:59:39 +0200 Subject: locking/lockdep: Untangle xhlock history save/restore from task independence Where XHLOCK_{SOFT,HARD} are save/restore points in the xhlocks[] to ensure the temporal IRQ events don't interact with task state, the XHLOCK_PROC is a fundament different beast that just happens to share the interface. The purpose of XHLOCK_PROC is to annotate independent execution inside one task. For example workqueues, each work should appear to run in its own 'pristine' 'task'. Remove XHLOCK_PROC in favour of its own interface to avoid confusion. 
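A hedged sketch of what a consumer of the new interface looks like: a loop that processes independent items wipes the crosslock history at the top of each iteration, exactly as the workqueue hunk below does. The queue helpers are hypothetical and the call assumes the crossrelease configuration is enabled:

#include <linux/lockdep.h>

struct my_item;
struct my_item *my_dequeue(void);		/* hypothetical */
void my_process(struct my_item *item);		/* hypothetical */

static int my_worker(void *unused)
{
	struct my_item *item;

	while ((item = my_dequeue()) != NULL) {
		/* each item must appear to run in a pristine 'task' */
		lockdep_invariant_state(true);
		my_process(item);
	}
	return 0;
}
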
Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: kernel-team@lge.com Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20170829085939.ggmb6xiohw67micb@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 79 +++++++++++++++++++++++------------------------- kernel/workqueue.c | 9 +++--- 2 files changed, 42 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f73ca595b81e..44c8d0d17170 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4623,13 +4623,8 @@ asmlinkage __visible void lockdep_sys_exit(void) /* * The lock history for each syscall should be independent. So wipe the * slate clean on return to userspace. - * - * crossrelease_hist_end() works well here even when getting here - * without starting (i.e. just after forking), because it rolls back - * the index to point to the last entry, which is already invalid. */ - crossrelease_hist_end(XHLOCK_PROC); - crossrelease_hist_start(XHLOCK_PROC, false); + lockdep_invariant_state(false); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4723,19 +4718,47 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) } /* - * Lock history stacks; we have 3 nested lock history stacks: + * Lock history stacks; we have 2 nested lock history stacks: * * HARD(IRQ) * SOFT(IRQ) - * PROC(ess) * * The thing is that once we complete a HARD/SOFT IRQ the future task locks * should not depend on any of the locks observed while running the IRQ. So * what we do is rewind the history buffer and erase all our knowledge of that * temporal event. - * - * The PROCess one is special though; it is used to annotate independence - * inside a task. + */ + +void crossrelease_hist_start(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (!cur->xhlocks) + return; + + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (cur->xhlocks) { + unsigned int idx = cur->xhlock_idx_hist[c]; + struct hist_lock *h = &xhlock(idx); + + cur->xhlock_idx = idx; + + /* Check if the ring was overwritten. */ + if (h->hist_id != cur->hist_id_save[c]) + invalidate_xhlock(h); + } +} + +/* + * lockdep_invariant_state() is used to annotate independence inside a task, to + * make one task look like multiple independent 'tasks'. * * Take for instance workqueues; each work is independent of the last. The * completion of a future work does not depend on the completion of a past work @@ -4758,40 +4781,14 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) * entry. Similarly, independence per-definition means it does not depend on * prior state. */ -void crossrelease_hist_start(enum xhlock_context_t c, bool force) +void lockdep_invariant_state(bool force) { - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - /* * We call this at an invariant point, no current state, no history. + * Verify the former, enforce the latter. 
*/ - if (c == XHLOCK_PROC) { - /* verified the former, ensure the latter */ - WARN_ON_ONCE(!force && cur->lockdep_depth); - invalidate_xhlock(&xhlock(cur->xhlock_idx)); - } - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } + WARN_ON_ONCE(!force && current->lockdep_depth); + invalidate_xhlock(&xhlock(current->xhlock_idx)); } static int cross_lock(struct lockdep_map *lock) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0331891dec1..ab3c0dc8c7ed 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2094,8 +2094,8 @@ __acquires(&pool->lock) lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* - * Strictly speaking we should do start(PROC) without holding any - * locks, that is, before these two lock_map_acquire()'s. + * Strictly speaking we should mark the invariant state without holding + * any locks, that is, before these two lock_map_acquire()'s. * * However, that would result in: * @@ -2107,14 +2107,14 @@ __acquires(&pool->lock) * Which would create W1->C->W1 dependencies, even though there is no * actual deadlock possible. There are two solutions, using a * read-recursive acquire on the work(queue) 'locks', but this will then - * hit the lockdep limitation on recursive locks, or simly discard + * hit the lockdep limitation on recursive locks, or simply discard * these locks. * * AFAICT there is no possible deadlock scenario between the * flush_work() and complete() primitives (except for single-threaded * workqueues), so hiding them isn't a problem. */ - crossrelease_hist_start(XHLOCK_PROC, true); + lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2122,7 +2122,6 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work); - crossrelease_hist_end(XHLOCK_PROC); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); -- cgit v1.2.3 From 966a967116e699762dbf4af7f9e0d1955c25aa37 Mon Sep 17 00:00:00 2001 From: Ying Huang Date: Tue, 8 Aug 2017 12:30:00 +0800 Subject: smp: Avoid using two cache lines for struct call_single_data struct call_single_data is used in IPIs to transfer information between CPUs. Its size is bigger than sizeof(unsigned long) and less than cache line size. Currently it is not allocated with any explicit alignment requirements. This makes it possible for allocated call_single_data to cross two cache lines, which results in double the number of the cache lines that need to be transferred among CPUs. This can be fixed by requiring call_single_data to be aligned with the size of call_single_data. Currently the size of call_single_data is the power of 2. If we add new fields to call_single_data, we may need to add padding to make sure the size of new definition is the power of 2 as well. Fortunately, this is enforced by GCC, which will report bad sizes. To set alignment requirements of call_single_data to the size of call_single_data, a struct definition and a typedef is used. To test the effect of the patch, I used the vm-scalability multiple thread swap test case (swap-w-seq-mt). 
The test will create multiple threads and each thread will eat memory until all RAM and part of swap is used, so that huge number of IPIs are triggered when unmapping memory. In the test, the throughput of memory writing improves ~5% compared with misaligned call_single_data, because of faster IPIs. Suggested-by: Peter Zijlstra Signed-off-by: Huang, Ying [ Add call_single_data_t and align with size of call_single_data. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Borislav Petkov Cc: Eric Dumazet Cc: Juergen Gross Cc: Linus Torvalds Cc: Michael Ellerman Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/87bmnqd6lz.fsf@yhuang-mobile.sh.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 +- kernel/smp.c | 32 +++++++++++++++++--------------- kernel/up.c | 2 +- 3 files changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..f29a7d2b57e1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -769,7 +769,7 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; - struct call_single_data hrtick_csd; + call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; #endif diff --git a/kernel/smp.c b/kernel/smp.c index 3061483cb3ad..81cfca9b4cc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -28,7 +28,7 @@ enum { }; struct call_function_data { - struct call_single_data __percpu *csd; + call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); return -ENOMEM; } - cfd->csd = alloc_percpu(struct call_single_data); + cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); @@ -103,12 +103,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static __always_inline void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } -static __always_inline void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd) /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other - * fields of the specified call_single_data structure: + * fields of the specified call_single_data_t structure: */ smp_wmb(); } -static __always_inline void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd) smp_store_release(&csd->flags, 0); } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); /* - * Insert a previously allocated call_single_data element + * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. 
*/ -static int generic_exec_single(int cpu, struct call_single_data *csd, +static int generic_exec_single(int cpu, call_single_data_t *csd, smp_call_func_t func, void *info) { if (cpu == smp_processor_id()) { @@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) { struct llist_head *head; struct llist_node *entry; - struct call_single_data *csd, *csd_next; + call_single_data_t *csd, *csd_next; static bool warned; WARN_ON(!irqs_disabled()); @@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data *csd; - struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; + call_single_data_t *csd; + call_single_data_t csd_stack = { + .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + }; int this_cpu; int err; @@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; @@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); csd_lock(csd); if (wait) @@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask, if (wait) { for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd; + call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); diff --git a/kernel/up.c b/kernel/up.c index ee81ac9af4ca..42c46bf3e0a5 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { unsigned long flags; -- cgit v1.2.3 From 34d54f3d6917f519693dbe873ee59cd06fb515ed Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 14 Aug 2017 16:07:02 -0400 Subject: locking/pvqspinlock: Relax cmpxchg's to improve performance on some architectures All the locking related cmpxchg's in the following functions are replaced with the _acquire variants: - pv_queued_spin_steal_lock() - trylock_clear_pending() This change should help performance on architectures that use LL/SC. The cmpxchg in pv_kick_node() is replaced with a relaxed version with explicit memory barrier to make sure that it is fully ordered in the writing of next->lock and the reading of pn->state whether the cmpxchg is a success or failure without affecting performance in non-LL/SC architectures. 
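As a minimal illustration of why acquire semantics are enough on the lock-taking path (a toy lock word, not the kernel's qspinlock layout): the critical section only needs to be ordered after a successful lock write, so the full barrier implied by a plain cmpxchg() is wasted work on LL/SC machines.

#include <linux/types.h>
#include <linux/atomic.h>

static inline bool my_trylock(atomic_t *locked)
{
	/* success orders later accesses after the lock write;
	 * nothing needs to be ordered on failure */
	return atomic_cmpxchg_acquire(locked, 0, 1) == 0;
}
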
On a 2-socket 12-core 96-thread Power8 system with pvqspinlock explicitly enabled, the performance of a locking microbenchmark with and without this patch on a 4.13-rc4 kernel with Xinhui's PPC qspinlock patch were as follows: # of thread w/o patch with patch % Change ----------- --------- ---------- -------- 8 5054.8 Mop/s 5209.4 Mop/s +3.1% 16 3985.0 Mop/s 4015.0 Mop/s +0.8% 32 2378.2 Mop/s 2396.0 Mop/s +0.7% Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Boqun Feng Cc: Linus Torvalds Cc: Pan Xinhui Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1502741222-24360-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 4ccfcaae5b89..43555681c40b 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) struct __qspinlock *l = (void *)lock; if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { qstat_inc(qstat_pv_lock_stealing, true); return true; } @@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock) /* * The pending bit check in pv_queued_spin_steal_lock() isn't a memory - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock - * just to be sure that it will get it. + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the + * lock just to be sure that it will get it. */ static __always_inline int trylock_clear_pending(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; return !READ_ONCE(l->locked) && - (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) - == _Q_PENDING_VAL); + (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, + _Q_LOCKED_VAL) == _Q_PENDING_VAL); } #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) @@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) */ old = val; new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; - val = atomic_cmpxchg(&lock->val, old, new); + val = atomic_cmpxchg_acquire(&lock->val, old, new); if (val == old) return 1; @@ -362,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * observe its next->locked value and advance itself. * * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + * + * The write to next->locked in arch_mcs_spin_unlock_contended() + * must be ordered before the read of pn->state in the cmpxchg() + * below for the code to work correctly. To guarantee full ordering + * irrespective of the success or failure of the cmpxchg(), + * a relaxed version with explicit barrier is used. The control + * dependency will order the reading of pn->state before any + * subsequent writes. 
*/ - if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + smp_mb__before_atomic(); + if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed) + != vcpu_halted) return; /* -- cgit v1.2.3 From 51218298a25e6942957c5595f2abf130d47d5df9 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 21 Aug 2017 00:01:46 +0200 Subject: alarmtimer: Ensure RTC module is not unloaded When registering the rtc device to be used to handle alarm timers, get_device is used to ensure the device doesn't go away but the module can still be unloaded. Call try_module_get to ensure the rtc driver will not go away. Reported-and-tested-by: Michal Simek Signed-off-by: Alexandre Belloni Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Stephen Boyd Link: http://lkml.kernel.org/r/20170820220146.30969-1-alexandre.belloni@free-electrons.com --- kernel/time/alarmtimer.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 73a2b476e59f..ec09ce9a6012 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -103,6 +104,11 @@ static int alarmtimer_rtc_add_device(struct device *dev, spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { + if (!try_module_get(rtc->owner)) { + spin_unlock_irqrestore(&rtcdev_lock, flags); + return -1; + } + rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); -- cgit v1.2.3 From 065e63f951432068ba89a844fcbff68ea16ee186 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 31 Aug 2017 17:03:47 -0400 Subject: tracing: Only have rmmod clear buffers that its events were active in Currently, when a module event is enabled, when that module is removed, it clears all ring buffers. This is to prevent another module from being loaded and having one of its trace event IDs from reusing a trace event ID of the removed module. This could cause undesirable effects as the trace event of the new module would be using its own processing algorithms to process raw data of another event. To prevent this, when a module is loaded, if any of its events have been used (signified by the WAS_ENABLED event call flag, which is never cleared), all ring buffers are cleared, just in case any one of them contains event data of the removed event. The problem is, there's no reason to clear all ring buffers if only one (or less than all of them) uses one of the events. Instead, only clear the ring buffers that recorded the events of a module that is being removed. To do this, instead of keeping the WAS_ENABLED flag with the trace event call, move it to the per instance (per ring buffer) event file descriptor. The event file descriptor maps each event to a separate ring buffer instance. Then when the module is removed, only the ring buffers that activated one of the module's events get cleared. The rest are not touched. 
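The bookkeeping change reduces to two small steps, sketched below with toy types (the real code keeps the bit in trace_event_file->flags and the flag in struct trace_array): record "was enabled" per event file, and on module removal mark only the instances whose files carry that bit.

#include <linux/types.h>

struct instance   { bool clear_trace; };
struct event_file {
	struct instance *tr;
	unsigned long    flags;
};
#define FL_WAS_ENABLED	(1UL << 0)	/* set once, never cleared */

static void event_enabled(struct event_file *file)
{
	file->flags |= FL_WAS_ENABLED;
}

static void module_removed(struct event_file *file)
{
	if (file->flags & FL_WAS_ENABLED)
		file->tr->clear_trace = true;	/* only this buffer is reset */
}
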
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 15 +++++++-------- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 44004d8aa3b3..30338a835a51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void) struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->clear_trace) + continue; + tr->clear_trace = false; tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 490ba229931d..fb5d54d0d1b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -245,6 +245,7 @@ struct trace_array { int stop_count; int clock_id; int nr_topts; + bool clear_trace; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 36132f9280e6..c93540c5df21 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -466,7 +466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ - call->flags |= TRACE_EVENT_FL_WAS_ENABLED; + set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } @@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call) do_for_each_event_file(tr, file) { if (file->event_call != call) continue; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; + ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is @@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; - bool clear_trace = false; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) { - if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) - clear_trace = true; + if (call->mod == mod) __trace_remove_event_call(call); - } } up_write(&trace_event_sem); @@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod) * over from this module may be passed to the new module events and * unexpected results may occur. */ - if (clear_trace) - tracing_reset_all_online_cpus(); + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, -- cgit v1.2.3 From 22cf8bc6cb0d48c6d31da8eccbd4201ab9b0c4c6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 31 Aug 2017 16:15:23 -0700 Subject: kernel/kthread.c: kthread_worker: don't hog the cpu If the worker thread continues getting work, it will hog the cpu and rcu stall complains. Make it a good citizen. This is triggered in a loop block device test. 
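A minimal userspace sketch of the same idea (sched_yield() standing in for the kernel's cond_resched(); the work queue here is a dummy counter, purely for illustration): a worker that keeps finding work should still give up the CPU between items so it cannot starve everything else.

#include <sched.h>
#include <stdio.h>

static int pending = 1000000;		/* pretend queue depth */

static int do_one_work_item(void)
{
	if (pending == 0)
		return 0;		/* nothing left, the worker can sleep */
	pending--;
	return 1;
}

int main(void)
{
	while (do_one_work_item())
		sched_yield();		/* analogue of cond_resched() */

	puts("queue drained");
	return 0;
}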
Link: http://lkml.kernel.org/r/5de0a179b3184e1a2183fc503448b0269f24d75b.1503697127.git.shli@fb.com Signed-off-by: Shaohua Li Cc: Petr Mladek Cc: Thomas Gleixner Cc: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 26db528c1d88..1c19edf82427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -637,6 +637,7 @@ repeat: schedule(); try_to_freeze(); + cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); -- cgit v1.2.3 From 355627f518978b5167256d27492fe0b343aaf2f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Aug 2017 16:15:26 -0700 Subject: mm, uprobes: fix multiple free of ->uprobes_state.xol_area Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced an new error path before the new mm_struct's ->uprobes_state.xol_area has been set to NULL after being copied from the old mm_struct by the memcpy in dup_mm(). For a task that has previously hit a uprobe tracepoint, this resulted in the 'struct xol_area' being freed multiple times if the task was killed at just the right time while forking. Fix it by setting ->uprobes_state.xol_area to NULL in mm_init() rather than in uprobe_dup_mmap(). With CONFIG_UPROBE_EVENTS=y, the bug can be reproduced by the same C program given by commit 2b7e8665b4ff ("fork: fix incorrect fput of ->exe_file causing use-after-free"), provided that a uprobe tracepoint has been set on the fork_thread() function. For example: $ gcc reproducer.c -o reproducer -lpthread $ nm reproducer | grep fork_thread 0000000000400719 t fork_thread $ echo "p $PWD/reproducer:0x719" > /sys/kernel/debug/tracing/uprobe_events $ echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable $ ./reproducer Here is the use-after-free reported by KASAN: BUG: KASAN: use-after-free in uprobe_clear_state+0x1c4/0x200 Read of size 8 at addr ffff8800320a8b88 by task reproducer/198 CPU: 1 PID: 198 Comm: reproducer Not tainted 4.13.0-rc7-00015-g36fde05f3fb5 #255 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Call Trace: dump_stack+0xdb/0x185 print_address_description+0x7e/0x290 kasan_report+0x23b/0x350 __asan_report_load8_noabort+0x19/0x20 uprobe_clear_state+0x1c4/0x200 mmput+0xd6/0x360 do_exit+0x740/0x1670 do_group_exit+0x13f/0x380 get_signal+0x597/0x17d0 do_signal+0x99/0x1df0 exit_to_usermode_loop+0x166/0x1e0 syscall_return_slowpath+0x258/0x2c0 entry_SYSCALL_64_fastpath+0xbc/0xbe ... Allocated by task 199: save_stack_trace+0x1b/0x20 kasan_kmalloc+0xfc/0x180 kmem_cache_alloc_trace+0xf3/0x330 __create_xol_area+0x10f/0x780 uprobe_notify_resume+0x1674/0x2210 exit_to_usermode_loop+0x150/0x1e0 prepare_exit_to_usermode+0x14b/0x180 retint_user+0x8/0x20 Freed by task 199: save_stack_trace+0x1b/0x20 kasan_slab_free+0xa8/0x1a0 kfree+0xba/0x210 uprobe_clear_state+0x151/0x200 mmput+0xd6/0x360 copy_process.part.8+0x605f/0x65d0 _do_fork+0x1a5/0xbd0 SyS_clone+0x19/0x20 do_syscall_64+0x22f/0x660 return_from_SYSCALL_64+0x0/0x7a Note: without KASAN, you may instead see a "Bad page state" message, or simply a general protection fault. 
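The class of bug described here can be illustrated with a small standalone sketch (illustrative C, not the kernel code; struct and function names are invented): a field copied wholesale from the parent must be reset to a safe value before the first error path that can tear the new object down, otherwise the teardown frees a pointer the parent still owns.

#include <stdlib.h>
#include <string.h>

struct ctx {
	void *cache;		/* analogue of uprobes_state.xol_area */
};

static void free_ctx(struct ctx *c)
{
	free(c->cache);		/* would be a double free of the parent's
				   cache if the reset below were skipped */
	free(c);
}

static struct ctx *dup_ctx(const struct ctx *old, int simulate_failure)
{
	struct ctx *new = malloc(sizeof(*new));

	if (!new)
		return NULL;
	memcpy(new, old, sizeof(*new));
	new->cache = NULL;	/* reset *before* any bail-out below */

	if (simulate_failure) {	/* stands in for the killable mmap_sem wait */
		free_ctx(new);
		return NULL;
	}
	return new;
}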
Link: http://lkml.kernel.org/r/20170830033303.17927-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Mark Rutland Cc: Michal Hocko Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 -- kernel/fork.c | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e137f98a50c..267f6ef91d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ diff --git a/kernel/fork.c b/kernel/fork.c index cbbea277b3fb..b7e9e57b71ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -785,6 +785,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_uprobes_state(struct mm_struct *mm) +{ +#ifdef CONFIG_UPROBES + mm->uprobes_state.xol_area = NULL; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -812,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif + mm_init_uprobes_state(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3 From 2a5bfe47624bfc835aa0632a0505ba55576c98db Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 31 Aug 2017 17:36:51 -0400 Subject: ftrace: Zero out ftrace hashes when a module is removed When a ftrace filter has a module function, and that module is removed, the filter still has its address as being enabled. This can cause interesting side effects. Nothing dangerous, but unwanted functions can be traced because of it. # cd /sys/kernel/tracing # echo ':mod:snd_seq' > set_ftrace_filter # cat set_ftrace_filter snd_use_lock_sync_helper [snd_seq] check_event_type_and_length [snd_seq] snd_seq_ioctl_pversion [snd_seq] snd_seq_ioctl_client_id [snd_seq] snd_seq_ioctl_get_queue_tempo [snd_seq] update_timestamp_of_queue [snd_seq] snd_seq_ioctl_get_queue_status [snd_seq] snd_seq_set_queue_tempo [snd_seq] snd_seq_ioctl_set_queue_tempo [snd_seq] snd_seq_ioctl_get_queue_timer [snd_seq] seq_free_client1 [snd_seq] [..] # rmmod snd_seq # cat set_ftrace_filter # modprobe kvm # cat set_ftrace_filter kvm_set_cr4 [kvm] kvm_emulate_hypercall [kvm] kvm_set_dr [kvm] This is because removing the snd_seq module after it was being filtered, left the address of the snd_seq functions in the hash. When the kvm module was loaded, some of its functions were loaded at the same address as the snd_seq module. This would enable them to be filtered and traced. Now we don't want to clear the hash completely. That would cause removing a module where only its functions are filtered, to cause the tracing to enable all functions, as an empty filter means to trace all functions. Instead, just set the hash ip address to zero. Then it will never match any function. 
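A compact sketch of the "poison instead of delete" idea above (illustrative C with invented names, not the ftrace code): deleting the stale entries outright could leave the filter empty, which means "trace everything", so the entries are kept but their address is zeroed so they can never match a real function again.

struct filter_entry {
	unsigned long ip;	/* 0 means "never matches" */
};

static void invalidate_module_entries(struct filter_entry *entries, int nr,
				      unsigned long mod_start,
				      unsigned long mod_end)
{
	int i;

	for (i = 0; i < nr; i++) {
		unsigned long ip = entries[i].ip;

		if (ip >= mod_start && ip < mod_end)
			entries[i].ip = 0;	/* keep the entry, kill the match */
	}
}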
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 96cea88fa00f..165b149ccb1a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5690,10 +5690,51 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } +static void +clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int i; + + if (ftrace_hash_empty(hash)) + return; + + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + entry = __ftrace_lookup_ip(hash, rec->ip); + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. + */ + if (entry) + entry->ip = 0; + } +} + +/* Clear any records from hashs */ +static void clear_mod_from_hashes(struct ftrace_page *pg) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash); + clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page **last_pg; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; int order; @@ -5723,14 +5764,25 @@ void ftrace_release_mod(struct module *mod) ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - free_pages((unsigned long)pg->records, order); - kfree(pg); + + pg->next = tmp_page; + tmp_page = pg; } else last_pg = &pg->next; } out_unlock: mutex_unlock(&ftrace_lock); + + for (pg = tmp_page; pg; pg = tmp_page) { + + /* Needs to be called outside of ftrace_lock */ + clear_mod_from_hashes(pg); + + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + tmp_page = pg->next; + kfree(pg); + } } void ftrace_module_enable(struct module *mod) -- cgit v1.2.3 From 46320a6acc4fb58f04bcf78c4c942cc43b20f986 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 12:04:09 -0400 Subject: ftrace: Fix selftest goto location on error In the second iteration of trace_selftest_ops(), the error goto label is wrong in the case where trace_selftest_test_global_cnt is off. In the case of error, it leaks the dynamic ops that was allocated. 
Cc: stable@vger.kernel.org Fixes: 95950c2e ("ftrace: Add self-tests for multiple function trace users") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index cb917cebae29..b17ec642793b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -273,7 +273,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out_free; if (cnt > 1) { if (trace_selftest_test_global_cnt == 0) - goto out; + goto out_free; } if (trace_selftest_test_dyn_cnt == 0) goto out_free; -- cgit v1.2.3 From cc555421bc118edd070f41258d6f55f1ccfc2558 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 31 Aug 2017 23:27:12 -0700 Subject: bpf: Inline LRU map lookup Inline the lru map lookup to save the cost in making calls to bpf_map_lookup_elem() and htab_lru_map_lookup_elem(). Different LRU hash size is tested. The benefit diminishes when the cache miss starts to dominate in the bigger LRU hash. Considering the change is simple, it is still worth to optimize. First column: Size of the LRU hash Second column: Number of lookups/s Before: > for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 $((2**i+1)) 10000000 | awk '{print $3}')"; done 513: 1132020 1025: 1056826 2049: 1007024 4097: 853298 8193: 742723 16385: 712600 32769: 688142 65537: 677028 131073: 619437 262145: 498770 524289: 316695 1048577: 260038 After: > for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 $((2**i+1)) 10000000 | awk '{print $3}')"; done 513: 1221851 1025: 1144695 2049: 1049902 4097: 884460 8193: 773731 16385: 729673 32769: 721989 65537: 715530 131073: 671665 262145: 516987 524289: 321125 1048577: 260048 Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d246905f2bb1..682f4543fefa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -514,6 +514,24 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static u32 htab_lru_map_gen_lookup(struct bpf_map *map, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); + *insn++ = BPF_ST_MEM(BPF_B, ret, + offsetof(struct htab_elem, lru_node) + + offsetof(struct bpf_lru_node, ref), + 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct htab_elem, key) + + round_up(map->key_size, 8)); + return insn - insn_buf; +} + /* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab. */ @@ -1137,6 +1155,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_lookup_elem = htab_lru_map_lookup_elem, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_gen_lookup = htab_lru_map_gen_lookup, }; /* Called from eBPF program */ -- cgit v1.2.3 From bb9b9f8802212d98e70c63045b1734162945eaa5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 31 Aug 2017 23:27:13 -0700 Subject: bpf: Only set node->ref = 1 if it has not been set This patch writes 'node->ref = 1' only if node->ref is 0. 
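As a sketch of the pattern (illustrative C with invented names, not the BPF code), the point is that an unconditional store always dirties the cache line, while testing first keeps the line in the shared state when the flag is already set; the benchmark that motivated the change follows below.

struct lru_node_example {
	unsigned char ref;
	/* ... other fields ... */
};

static inline void node_set_ref(struct lru_node_example *node)
{
	/*
	 * ref is only an access-frequency hint, so no locking is needed;
	 * skip the store when the flag is already set to avoid bouncing
	 * the cache line between CPUs on every lookup.
	 */
	if (!node->ref)
		node->ref = 1;
}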
The number of lookups/s for a ~1M entries LRU map increased by ~30% (260097 to 343313). Other writes on 'node->ref = 0' is not changed. In those cases, the same cache line has to be changed anyway. First column: Size of the LRU hash Second column: Number of lookups/s Before: > echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 10000000 | awk '{print $3}')" 1048577: 260097 After: > echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 10000000 | awk '{print $3}')" 1048577: 343313 Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.h | 3 ++- kernel/bpf/hashtab.c | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 5c35a98d02bf..7d4f89b7cb84 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) /* ref is an approximation on access frequency. It does not * have to be very accurate. Hence, no protection is used. */ - node->ref = 1; + if (!node->ref) + node->ref = 1; } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 682f4543fefa..431126f31ea3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -519,9 +519,14 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map, { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; + const int ref_reg = BPF_REG_1; *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); - *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); + *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, + offsetof(struct htab_elem, lru_node) + + offsetof(struct bpf_lru_node, ref)); + *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); *insn++ = BPF_ST_MEM(BPF_B, ret, offsetof(struct htab_elem, lru_node) + offsetof(struct bpf_lru_node, ref), -- cgit v1.2.3 From edb096e00724f02db5f6ec7900f3bbd465c6c76f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 12:18:28 -0400 Subject: ftrace: Fix memleak when unregistering dynamic ops when tracing disabled If function tracing is disabled by the user via the function-trace option or the proc sysctl file, and a ftrace_ops that was allocated on the heap is unregistered, then the shutdown code exits out without doing the proper clean up. This was found via kmemleak and running the ftrace selftests, as one of the tests unregisters with function tracing disabled. # cat kmemleak unreferenced object 0xffffffffa0020000 (size 4096): comm "swapper/0", pid 1, jiffies 4294668889 (age 569.209s) hex dump (first 32 bytes): 55 ff 74 24 10 55 48 89 e5 ff 74 24 18 55 48 89 U.t$.UH...t$.UH. 
e5 48 81 ec a8 00 00 00 48 89 44 24 50 48 89 4c .H......H.D$PH.L backtrace: [] kmemleak_vmalloc+0x85/0xf0 [] __vmalloc_node_range+0x281/0x3e0 [] module_alloc+0x4f/0x90 [] arch_ftrace_update_trampoline+0x160/0x420 [] ftrace_startup+0xe7/0x300 [] register_ftrace_function+0x72/0x90 [] trace_selftest_ops+0x204/0x397 [] trace_selftest_startup_function+0x394/0x624 [] run_tracer_selftest+0x15c/0x1d7 [] init_trace_selftests+0x75/0x192 [] do_one_initcall+0x90/0x1e2 [] kernel_init_freeable+0x350/0x3fe [] kernel_init+0x13/0x122 [] ret_from_fork+0x2a/0x40 [] 0xffffffffffffffff Cc: stable@vger.kernel.org Fixes: 12cce594fa ("ftrace/x86: Allow !CONFIG_PREEMPT dynamic ops to use allocated trampolines") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 165b149ccb1a..6abfafd7f173 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2828,13 +2828,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are per_cpu ops, they still need their - * per_cpu field freed. Since, function tracing is + * If these are dynamic or per_cpu ops, they still + * need their data freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + goto free_ops; + return 0; } @@ -2900,6 +2901,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (IS_ENABLED(CONFIG_PREEMPT)) synchronize_rcu_tasks(); + free_ops: arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) -- cgit v1.2.3 From 90a9631cf8c27a2b4702af600cad390fcabb88fb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 1 Sep 2017 11:29:26 -0700 Subject: bpf: sockmap update/simplify memory accounting scheme Instead of tracking wmem_queued and sk_mem_charge by incrementing in the verdict SK_REDIRECT paths and decrementing in the tx work path use skb_set_owner_w and sock_writeable helpers. This solves a few issues with the current code. First, in SK_REDIRECT inc on sk_wmem_queued and sk_mem_charge were being done without the peers sock lock being held. Under stress this can result in accounting errors when tx work and/or multiple verdict decisions are working on the peer psock. Additionally, this cleans up the code because we can rely on the default destructor to decrement memory accounting on kfree_skb. Also this will trigger sk_write_space when space becomes available on kfree_skb() which wasn't happening before and prevent __sk_free from being called until all in-flight packets are completed. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/sockmap.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index db0d99d2fe18..f6ffde9c6a68 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -111,7 +111,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) { - struct sock *sock; + struct sock *sk; int rc; /* Because we use per cpu values to feed input from sock redirect @@ -123,16 +123,16 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) rc = smap_verdict_func(psock, skb); switch (rc) { case SK_REDIRECT: - sock = do_sk_redirect_map(); + sk = do_sk_redirect_map(); preempt_enable(); - if (likely(sock)) { - struct smap_psock *peer = smap_psock_sk(sock); + if (likely(sk)) { + struct smap_psock *peer = smap_psock_sk(sk); if (likely(peer && test_bit(SMAP_TX_RUNNING, &peer->state) && - sk_stream_memory_free(peer->sock))) { - peer->sock->sk_wmem_queued += skb->truesize; - sk_mem_charge(peer->sock, skb->truesize); + !sock_flag(sk, SOCK_DEAD) && + sock_writeable(sk))) { + skb_set_owner_w(skb, sk); skb_queue_tail(&peer->rxqueue, skb); schedule_work(&peer->tx_work); break; @@ -282,16 +282,12 @@ start: /* Hard errors break pipe and stop xmit */ smap_report_sk_error(psock, n ? -n : EPIPE); clear_bit(SMAP_TX_RUNNING, &psock->state); - sk_mem_uncharge(psock->sock, skb->truesize); - psock->sock->sk_wmem_queued -= skb->truesize; kfree_skb(skb); goto out; } rem -= n; off += n; } while (rem); - sk_mem_uncharge(psock->sock, skb->truesize); - psock->sock->sk_wmem_queued -= skb->truesize; kfree_skb(skb); } out: -- cgit v1.2.3 From b9047726386bb538cf5e4f52a9f04eb556eebc67 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 2 Aug 2017 19:51:11 -0700 Subject: ipc: mqueue: Replace timespec with timespec64 struct timespec is not y2038 safe. Replace all uses of timespec by y2038 safe struct timespec64. Even though timespec is used here to represent timeouts, replace these with timespec64 so that it facilitates in verification by creating a y2038 safe kernel image that is free of timespec. The syscall interfaces themselves are not changed as part of the patch. They will be part of a different series. 
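For reference, a hedged illustration of the y2038 point (the layouts below only approximate the kernel definitions and are reproduced purely as an example, not as authoritative headers): with a signed 32-bit tv_sec the epoch offset overflows on 2038-01-19 03:14:07 UTC, while a 64-bit tv_sec does not.

#include <stdint.h>

struct timespec32_example {
	int32_t tv_sec;		/* overflows after 2038-01-19 03:14:07 UTC */
	int32_t tv_nsec;
};

struct timespec64_example {
	int64_t tv_sec;		/* y2038 safe */
	long	tv_nsec;
};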
Signed-off-by: Deepa Dinamani Cc: Paul Moore Cc: Richard Guy Briggs Reviewed-by: Richard Guy Briggs Reviewed-by: Arnd Bergmann Acked-by: Paul Moore Signed-off-by: Al Viro --- kernel/audit.h | 2 +- kernel/auditsc.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index b331d9b83f63..9b110ae17ee3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -182,7 +182,7 @@ struct audit_context { mqd_t mqdes; size_t msg_len; unsigned int msg_prio; - struct timespec abs_timeout; + struct timespec64 abs_timeout; } mq_sendrecv; struct { int oflag; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3260ba2312a9..daee2d5bd03a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1235,11 +1235,11 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + "abs_timeout_sec=%lld abs_timeout_nsec=%ld", context->mq_sendrecv.mqdes, context->mq_sendrecv.msg_len, context->mq_sendrecv.msg_prio, - context->mq_sendrecv.abs_timeout.tv_sec, + (long long) context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); break; case AUDIT_MQ_NOTIFY: @@ -2083,15 +2083,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) * */ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec *abs_timeout) + const struct timespec64 *abs_timeout) { struct audit_context *context = current->audit_context; - struct timespec *p = &context->mq_sendrecv.abs_timeout; + struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) - memcpy(p, abs_timeout, sizeof(struct timespec)); + memcpy(p, abs_timeout, sizeof(*p)); else - memset(p, 0, sizeof(struct timespec)); + memset(p, 0, sizeof(*p)); context->mq_sendrecv.mqdes = mqdes; context->mq_sendrecv.msg_len = msg_len; -- cgit v1.2.3 From bdd1d2d3d251c65b74ac4493e08db18971c09240 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:13 +0200 Subject: fs: fix kernel_read prototype Use proper ssize_t and size_t types for the return value and count argument, move the offset last and make it an in/out argument like all other read/write helpers, and make the buf argument a void pointer to get rid of lots of casts in the callers. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl_binary.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 02e1859f2ca8..243fa1c28b4a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -986,8 +986,9 @@ static ssize_t bin_intvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1057,8 +1058,9 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1120,8 +1122,9 @@ static ssize_t bin_uuid(struct file *file, if (oldval && oldlen) { char buf[UUID_STRING_LEN + 1]; uuid_t uuid; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; @@ -1154,8 +1157,9 @@ static ssize_t bin_dn_node_address(struct file *file, char buf[15], *nodep; unsigned long area, node; __le16 dnaddr; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; -- cgit v1.2.3 From e13ec939e96b13e664bb6cee361cc976a0ee621a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:14 +0200 Subject: fs: fix kernel_write prototype Make the position an in/out argument like all the other read/write helpers and and make the buf argument a void pointer. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl_binary.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 243fa1c28b4a..58ea8c03662e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1017,6 +1017,7 @@ static ssize_t bin_intvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1030,7 +1031,7 @@ static ssize_t bin_intvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1089,6 +1090,7 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1102,7 +1104,7 @@ static ssize_t bin_ulongvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1192,6 +1194,7 @@ static ssize_t bin_dn_node_address(struct file *file, __le16 dnaddr; char buf[15]; int len; + loff_t pos = 0; result = -EINVAL; if (newlen != sizeof(dnaddr)) @@ -1205,7 +1208,7 @@ static ssize_t bin_dn_node_address(struct file *file, le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); - result = kernel_write(file, buf, len, 0); + result = kernel_write(file, buf, len, &pos); if (result < 0) goto out; } -- cgit v1.2.3 From 73e18f7c0b3e1432353cdd86672c27cace7e6a7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:15 +0200 Subject: fs: make the buf argument to __kernel_write a void pointer This matches kernel_read and kernel_write and avoids any need for casts in the callers. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 5b1284370367..5e72af29ab73 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -516,7 +516,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) if (file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + __kernel_write(file, &ac, sizeof(acct_t), &pos); file_end_write(file); } out: -- cgit v1.2.3 From e832bf48c8e12f3b39e40fee35c4ea269d685875 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 4 Jul 2017 13:11:43 +0100 Subject: audit: Reduce overhead using a coarse clock Commit 2115bb250f26 ("audit: Use timespec64 to represent audit timestamps") noted that audit timestamps were not y2038 safe and used a 64-bit timestamp. In itself, this makes sense but the conversion was from CURRENT_TIME to ktime_get_real_ts64() which is a heavier call to record an accurate timestamp which is required in some, but not all, cases. The impact is that when auditd is running without any rules that all syscalls have higher overhead. This is visible in the sysbench-thread benchmark as a 11.5% performance hit. That benchmark is dumb as rocks but it's also visible in redis as an 8-10% hit on all operations which is of greater concern. It is somewhat stupid of audit to track syscalls without any rules related to syscalls but that is how it behaves. 
The overhead can be directly measured with perf comparing 4.9 with 4.12 4.9 7.76% sysbench [kernel.vmlinux] [k] __schedule 7.62% sysbench [kernel.vmlinux] [k] _raw_spin_lock 7.37% sysbench libpthread-2.22.so [.] __lll_lock_elision 7.29% sysbench [kernel.vmlinux] [.] syscall_return_via_sysret 6.59% sysbench [kernel.vmlinux] [k] native_sched_clock 5.21% sysbench libc-2.22.so [.] __sched_yield 4.38% sysbench [kernel.vmlinux] [k] entry_SYSCALL_64 4.28% sysbench [kernel.vmlinux] [k] do_syscall_64 3.49% sysbench libpthread-2.22.so [.] __lll_unlock_elision 3.13% sysbench [kernel.vmlinux] [k] __audit_syscall_exit 2.87% sysbench [kernel.vmlinux] [k] update_curr 2.73% sysbench [kernel.vmlinux] [k] pick_next_task_fair 2.31% sysbench [kernel.vmlinux] [k] syscall_trace_enter 2.20% sysbench [kernel.vmlinux] [k] __audit_syscall_entry ..... 0.00% swapper [kernel.vmlinux] [k] read_tsc 4.12 7.84% sysbench [kernel.vmlinux] [k] __schedule 7.05% sysbench [kernel.vmlinux] [k] _raw_spin_lock 6.57% sysbench libpthread-2.22.so [.] __lll_lock_elision 6.50% sysbench [kernel.vmlinux] [.] syscall_return_via_sysret 5.95% sysbench [kernel.vmlinux] [k] read_tsc 5.71% sysbench [kernel.vmlinux] [k] native_sched_clock 4.78% sysbench libc-2.22.so [.] __sched_yield 4.30% sysbench [kernel.vmlinux] [k] entry_SYSCALL_64 3.94% sysbench [kernel.vmlinux] [k] do_syscall_64 3.37% sysbench libpthread-2.22.so [.] __lll_unlock_elision 3.32% sysbench [kernel.vmlinux] [k] __audit_syscall_exit 2.91% sysbench [kernel.vmlinux] [k] __getnstimeofday64 Note the additional overhead from read_tsc which goes from 0% to 5.95%. This is on a single-socket E3-1230 but similar overheads have been measured on an older machine which the patch also eliminates. The patch in question has no explanation as to why a fully-accurate timestamp is required and is likely an oversight. Using a coarser, but monotically increasing, timestamp the overhead can be eliminated. While it can be worked around by configuring or disabling audit, it's tricky enough to detect that a kernel fix is justified. With this patch, we see the following; sysbenchthread 4.9.0 4.12.0 4.12.0 vanilla vanilla coarse-v1r1 Amean 1 1.49 ( 0.00%) 1.66 ( -11.42%) 1.51 ( -1.34%) Amean 3 1.48 ( 0.00%) 1.65 ( -11.45%) 1.50 ( -0.96%) Amean 5 1.49 ( 0.00%) 1.67 ( -12.31%) 1.51 ( -1.83%) Amean 7 1.49 ( 0.00%) 1.66 ( -11.72%) 1.50 ( -0.67%) Amean 12 1.48 ( 0.00%) 1.65 ( -11.57%) 1.52 ( -2.89%) Amean 16 1.49 ( 0.00%) 1.65 ( -11.13%) 1.51 ( -1.73%) The benchmark is reporting the time required for different thread counts to lock/unlock a private mutex which, while dense, demonstrates the syscall overhead. This is showing that 4.12 took a 11-12% hit but the overhead is almost eliminated by the patch. While the variance is not reported here, it's well within the noise with the patch applied. 
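A userspace analogue of the accurate-versus-coarse trade-off described above (this illustrates the two clock classes, it is not the audit code; ktime_get_real_ts64() corresponds roughly to CLOCK_REALTIME and current_kernel_time64() to the tick-granularity CLOCK_REALTIME_COARSE):

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec accurate, coarse;

	clock_gettime(CLOCK_REALTIME, &accurate);	/* reads the clocksource, precise but slower */
	clock_gettime(CLOCK_REALTIME_COARSE, &coarse);	/* last tick value, much cheaper */

	printf("accurate: %lld.%09ld\n",
	       (long long)accurate.tv_sec, accurate.tv_nsec);
	printf("coarse:   %lld.%09ld\n",
	       (long long)coarse.tv_sec, coarse.tv_nsec);
	return 0;
}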
Signed-off-by: Mel Gorman Acked-by: Arnd Bergmann Acked-by: Deepa Dinamani Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6dd556931739..0cf15c77c5eb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1662,7 +1662,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_real_ts64(t); + *t = current_kernel_time64(); *serial = audit_serial(); } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3260ba2312a9..fd72e11acfb2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1536,7 +1536,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; context->serial = 0; - ktime_get_real_ts64(&context->ctime); + context->ctime = current_kernel_time64(); context->in_syscall = 1; context->current_state = state; context->ppid = 0; -- cgit v1.2.3 From 196a5085592c62ffa4eb739d7ce49c040c2953a1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 7 Aug 2017 21:44:24 +0800 Subject: audit: update the function comments Update the function comments to match the code. Signed-off-by: Geliang Tang Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0cf15c77c5eb..be1c28fd4d57 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1833,7 +1833,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) } /** - * audit_log_hex - convert a buffer to hex and append it to the audit skb + * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fd72e11acfb2..aac1a41f82bd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1462,7 +1462,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } /** - * audit_free - free a per-task audit context + * __audit_free - free a per-task audit context * @tsk: task whose audit context block to free * * Called from copy_process and do_exit @@ -1489,7 +1489,7 @@ void __audit_free(struct task_struct *tsk) } /** - * audit_syscall_entry - fill in an audit record at syscall entry + * __audit_syscall_entry - fill in an audit record at syscall entry * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 @@ -1543,7 +1543,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, } /** - * audit_syscall_exit - deallocate audit context after a system call + * __audit_syscall_exit - deallocate audit context after a system call * @success: success value of the syscall * @return_code: return value of the syscall * @@ -1705,7 +1705,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, } /** - * audit_reusename - fill out filename with info from existing entry + * __audit_reusename - fill out filename with info from existing entry * @uptr: userland ptr to pathname * * Search the audit_names list for the current audit context. 
If there is an @@ -1730,7 +1730,7 @@ __audit_reusename(const __user char *uptr) } /** - * audit_getname - add a name to the list + * __audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. @@ -2135,7 +2135,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) } /** - * audit_ipc_obj - record audit data for ipc object + * __audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * */ @@ -2151,7 +2151,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) } /** - * audit_ipc_set_perm - record audit data for new ipc permissions + * __audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id @@ -2180,7 +2180,7 @@ void __audit_bprm(struct linux_binprm *bprm) /** - * audit_socketcall - record audit data for sys_socketcall + * __audit_socketcall - record audit data for sys_socketcall * @nargs: number of args, which should not be more than AUDITSC_ARGS. * @args: args array * @@ -2211,7 +2211,7 @@ void __audit_fd_pair(int fd1, int fd2) } /** - * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto + * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto * @len: data length in user space * @a: data address in kernel space * -- cgit v1.2.3 From 3d9622c12c8873911f4cc0ccdabd0362c2fca06b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 5 Sep 2017 11:32:01 -0400 Subject: tracing: Add barrier to trace_printk() buffer nesting modification trace_printk() uses 4 buffers, one for each context (normal, softirq, irq and NMI), such that it does not need to worry about one context preempting the other. There's a nesting counter that gets incremented to figure out which buffer to use. If the context gets preempted by another context which calls trace_printk() it will increment the counter and use the next buffer, and restore the counter when it is finished. The problem is that gcc may optimize the modification of the buffer nesting counter and it may not be incremented in memory before the buffer is used. If this happens, and the context gets interrupted by another context, it could pick the same buffer and corrupt the one that is being used. Compiler barriers need to be added after the nesting variable is incremented and before it is decremented to prevent usage of the context buffers by more than one context at the same time. 
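A minimal userspace sketch of the nesting-counter pattern (this is not the trace_printk() code; buffer sizes and names are invented): the counter must be visibly incremented before the buffer it selects is handed out, and visibly decremented only after the caller is done, so a nested context that interrupts us picks the next buffer instead of reusing ours.

#define NESTING_LEVELS	4
#define BUF_SIZE	256

/* compiler barrier, same effect as the kernel's barrier() */
#define barrier()	__asm__ __volatile__("" ::: "memory")

static char buffers[NESTING_LEVELS][BUF_SIZE];
static int nesting;

static char *get_buf(void)
{
	if (nesting >= NESTING_LEVELS)
		return NULL;

	nesting++;
	/* make the increment visible before the buffer is used */
	barrier();
	return buffers[nesting - 1];
}

static void put_buf(void)
{
	/* don't let the decrement be reordered before the last use */
	barrier();
	nesting--;
}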
Cc: Andy Lutomirski Cc: stable@vger.kernel.org Fixes: e2ace00117 ("tracing: Choose static tp_printk buffer by explicit nesting count") Hat-tip-to: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 30338a835a51..78842557eea0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2802,11 +2802,17 @@ static char *get_trace_buf(void) if (!buffer || buffer->nesting >= 4) return NULL; - return &buffer->buffer[buffer->nesting++][0]; + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting][0]; } static void put_trace_buf(void) { + /* Don't let the decrement of nesting leak before this */ + barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } -- cgit v1.2.3 From 7685ab6c58557c6234f3540260195ecbee7fc4b3 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Tue, 5 Sep 2017 13:36:46 +0800 Subject: tracing: Fix clear of RECORDED_TGID flag when disabling trace event When disabling one trace event, the RECORDED_TGID flag in the event file is not correctly cleared. It's clearing RECORDED_CMD flag when it should clear RECORDED_TGID flag. Link: http://lkml.kernel.org/r/1504589806-8425-1-git-send-email-chuhu@redhat.com Cc: Joel Fernandes Cc: stable@vger.kernel.org Fixes: d914ba37d7 ("tracing: Add support for recording tgid of tasks") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c93540c5df21..87468398b9ed 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -406,7 +406,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); - clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); -- cgit v1.2.3 From 96e5ae4e76f1ea950d493f510399b49308bea731 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Sep 2017 22:41:02 -0700 Subject: bpf: fix numa_node validation syzkaller reported crashes in bpf map creation or map update [1] Problem is that nr_node_ids is a signed integer, NUMA_NO_NODE is also an integer, so it is very tempting to declare numa_node as a signed integer. This means the typical test to validate a user provided value : if (numa_node != NUMA_NO_NODE && (numa_node >= nr_node_ids || !node_online(numa_node))) must be written : if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) [1] kernel BUG at mm/slab.c:3256! 
invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 2946 Comm: syzkaller916108 Not tainted 4.13.0-rc7+ #35 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d2bc60c0 task.stack: ffff8801c0c90000 RIP: 0010:____cache_alloc_node+0x1d4/0x1e0 mm/slab.c:3292 RSP: 0018:ffff8801c0c97638 EFLAGS: 00010096 RAX: ffffffffffff8b7b RBX: 0000000001080220 RCX: 0000000000000000 RDX: 00000000ffff8b7b RSI: 0000000001080220 RDI: ffff8801dac00040 RBP: ffff8801c0c976c0 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8801c0c97620 R11: 0000000000000001 R12: ffff8801dac00040 R13: ffff8801dac00040 R14: 0000000000000000 R15: 00000000ffff8b7b FS: 0000000002119940(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020001fec CR3: 00000001d2980000 CR4: 00000000001406f0 Call Trace: __do_kmalloc_node mm/slab.c:3688 [inline] __kmalloc_node+0x33/0x70 mm/slab.c:3696 kmalloc_node include/linux/slab.h:535 [inline] alloc_htab_elem+0x2a8/0x480 kernel/bpf/hashtab.c:740 htab_map_update_elem+0x740/0xb80 kernel/bpf/hashtab.c:820 map_update_elem kernel/bpf/syscall.c:587 [inline] SYSC_bpf kernel/bpf/syscall.c:1468 [inline] SyS_bpf+0x20c5/0x4c40 kernel/bpf/syscall.c:1443 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x440409 RSP: 002b:00007ffd1f1792b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440409 RDX: 0000000000000020 RSI: 0000000020006000 RDI: 0000000000000002 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000401d70 R13: 0000000000401e00 R14: 0000000000000000 R15: 0000000000000000 Code: 83 c2 01 89 50 18 4c 03 70 08 e8 38 f4 ff ff 4d 85 f6 0f 85 3e ff ff ff 44 89 fe 4c 89 ef e8 94 fb ff ff 49 89 c6 e9 2b ff ff ff <0f> 0b 0f 0b 0f 0b 66 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 RIP: ____cache_alloc_node+0x1d4/0x1e0 mm/slab.c:3292 RSP: ffff8801c0c97638 ---[ end trace d745f355da2e33ce ]--- Kernel panic - not syncing: Fatal exception Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 021a05d9d800..70ad8e220343 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -323,7 +323,8 @@ static int map_create(union bpf_attr *attr) return -EINVAL; if (numa_node != NUMA_NO_NODE && - (numa_node >= nr_node_ids || !node_online(numa_node))) + ((unsigned int)numa_node >= nr_node_ids || + !node_online(numa_node))) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ -- cgit v1.2.3 From 596a7a1d0989c621c3ae49be73a1d1f9de22eb5a Mon Sep 17 00:00:00 2001 From: John Keeping Date: Wed, 6 Sep 2017 10:35:40 +0100 Subject: genirq/msi: Fix populating multiple interrupts On allocating the interrupts routed via a wire-to-MSI bridge, the allocator iterates over the MSI descriptors to build the hierarchy, but fails to use the descriptor interrupt number, and instead uses the base number, generating the wrong IRQ domain mappings. 
The fix is to use the MSI descriptor interrupt number when setting up the interrupt instead of the base interrupt for the allocation range. The only saving grace is that although the MSI descriptors are allocated in bulk, the wired interrupts are only allocated one by one (so desc->irq == virq) and the bug went unnoticed so far. Fixes: 2145ac9310b60 ("genirq/msi: Add msi_domain_populate_irqs") Signed-off-by: John Keeping Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170906103540.373864a2.john@metanate.com --- kernel/irq/msi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 48eadf416c24..3fa4bd59f569 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -315,11 +315,12 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(arg, desc); /* Assumes the domain mutex is held! */ - ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1, + arg); if (ret) break; - irq_set_msi_desc_off(virq, 0, desc); + irq_set_msi_desc_off(desc->irq, 0, desc); } if (ret) { -- cgit v1.2.3 From 65f3975f3584eee2da88b11f06f66e2d39fd30d0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 6 Sep 2017 16:21:50 -0700 Subject: cgroup: revert fa06235b8eb0 ("cgroup: reset css on destruction") Commit fa06235b8eb0 ("cgroup: reset css on destruction") caused the css_reset callback to be called from the offlining path. Although it solves the problem mentioned in the commit description ("For instance, memory cgroup needs to reset memory.low, otherwise pages charged to a dead cgroup might never get reclaimed."), generally speaking, it's not correct. An offline cgroup can still be a resource domain, and we shouldn't grant it more resources than it had before deletion. For instance, if an offline memory cgroup has dirty pages, we should still imply i/o limits during writeback. The css_reset callback is designed to return the cgroup state to its original state, which means resetting all limits and counters. It's something different from the offlining, and we shouldn't use it from the offlining path. Instead, we should adjust necessary settings from the per-controller css_offline callbacks (e.g. reset memory.low). Link: http://lkml.kernel.org/r/20170727130428.28856-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Acked-by: Johannes Weiner Cc: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index df2e0f14a95d..f64fc967a9ef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4100,9 +4100,6 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; - if (ss->css_reset) - ss->css_reset(css); - if (ss->css_offline) ss->css_offline(css); -- cgit v1.2.3 From ab1b597ee0e4208a1db227bb7b2c9512c8234b48 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Sep 2017 16:24:13 -0700 Subject: mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups devm_memremap_pages() records mapped ranges in pgmap_radix with an entry per section's worth of memory (128MB). The key for each of those entries is a section number.
This leads to false positives when devm_memremap_pages() is passed a section-unaligned range as lookups in the misalignment fail to return NULL. We can close this hole by using the pfn as the key for entries in the tree. The number of entries required to describe a remapped range is reduced by leveraging multi-order entries. In practice this approach usually yields just one entry in the tree if the size and starting address are of the same power-of-2 alignment. Previously we always needed nr_entries = mapping_size / 128MB. Link: https://lists.01.org/pipermail/linux-nvdimm/2016-August/006666.html Link: http://lkml.kernel.org/r/150215410565.39310.13767886055248249438.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Toshi Kani Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 9afdc434fb49..066e73c2fcc9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -194,18 +194,41 @@ struct page_map { struct vmem_altmap altmap; }; -static void pgmap_radix_release(struct resource *res) +static unsigned long order_at(struct resource *res, unsigned long pgoff) { - resource_size_t key, align_start, align_size, align_end; + unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; + unsigned long nr_pages, mask; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); - align_end = align_start + align_size - 1; + nr_pages = PHYS_PFN(resource_size(res)); + if (nr_pages == pgoff) + return ULONG_MAX; + + /* + * What is the largest aligned power-of-2 range available from + * this resource pgoff to the end of the resource range, + * considering the alignment of the current pgoff? + */ + mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); + if (!mask) + return ULONG_MAX; + + return find_first_bit(&mask, BITS_PER_LONG); +} + +#define foreach_order_pgoff(res, order, pgoff) \ + for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ + pgoff += 1UL << order, order = order_at((res), pgoff)) + +static void pgmap_radix_release(struct resource *res) +{ + unsigned long pgoff, order; mutex_lock(&pgmap_lock); - for (key = res->start; key <= res->end; key += SECTION_SIZE) - radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + foreach_order_pgoff(res, order, pgoff) + radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); mutex_unlock(&pgmap_lock); + + synchronize_rcu(); } static unsigned long pfn_first(struct page_map *page_map) @@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) WARN_ON_ONCE(!rcu_read_lock_held()); - page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); return page_map ? 
&page_map->pgmap : NULL; } @@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - resource_size_t key, align_start, align_size, align_end; + resource_size_t align_start, align_size, align_end; + unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; - unsigned long pfn; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; align_end = align_start + align_size - 1; - for (key = align_start; key <= align_end; key += SECTION_SIZE) { + + foreach_order_pgoff(res, order, pgoff) { struct dev_pagemap *dup; rcu_read_lock(); - dup = find_dev_pagemap(key); + dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); rcu_read_unlock(); if (dup) { dev_err(dev, "%s: %pr collides with mapping for %s\n", @@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, error = -EBUSY; break; } - error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, - page_map); + error = __radix_tree_insert(&pgmap_radix, + PHYS_PFN(res->start) + pgoff, order, page_map); if (error) { dev_err(dev, "%s: failed: %d\n", __func__, error); break; -- cgit v1.2.3 From da99ecf117fce6570bd3989263d68ee0007e1249 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:24:53 -0700 Subject: mm: replace TIF_MEMDIE checks by tsk_is_oom_victim TIF_MEMDIE is set only to the tasks which were either directly selected by the OOM killer or passed through mark_oom_victim from the allocator path. tsk_is_oom_victim is more generic and allows identifying all tasks (threads) which share the mm with the oom victim. Please note that the freezer still needs to check TIF_MEMDIE because we cannot thaw tasks which do not participate in oom_victims counting, otherwise a !TIF_MEMDIE task could interfere after oom_disable returns. Link: http://lkml.kernel.org/r/20170810075019.28998-3-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Mel Gorman Cc: Tetsuo Handa Cc: David Rientjes Cc: Johannes Weiner Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2f4039bafebb..e7485786db9b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -2500,12 +2501,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, - * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. + * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed as is marked TIF_MEMDIE. + * unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -2528,7 +2529,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok + * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -2546,7 +2547,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) + if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; -- cgit v1.2.3 From 212925802454672e6cd2949a727f5e2c1377bf06 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Sep 2017 16:25:00 -0700 Subject: mm: oom: let oom_reap_task and exit_mmap run concurrently This is purely required because exit_aio() may block and exit_mmap() may never start, if the oom_reap_task cannot start running on a mm with mm_users == 0. At the same time if the OOM reaper doesn't wait at all for the memory of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it would generate a spurious OOM kill. If it wasn't because of the exit_aio or similar blocking functions in the last mmput, it would be enough to change the oom_reap_task() in the case it finds mm_users == 0, to wait for a timeout or to wait for __mmput to set MMF_OOM_SKIP itself, but it's not just exit_mmap the problem here so the concurrency of exit_mmap and oom_reap_task is apparently warranted. It's a non standard runtime, exit_mmap() runs without mmap_sem, and oom_reap_task runs with the mmap_sem for reading as usual (kind of MADV_DONTNEED). The race between the two is solved with a combination of tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP (serialized by a dummy down_write/up_write cycle on the same lines of the ksm_exit method). If the oom_reap_task() may be running concurrently during exit_mmap, exit_mmap will wait it to finish in down_write (before taking down mm structures that would make the oom_reap_task fail with use after free). If exit_mmap comes first, oom_reap_task() will skip the mm if MMF_OOM_SKIP is already set and in turn all memory is already freed and furthermore the mm data structures may already have been taken down by free_pgtables. [aarcange@redhat.com: incremental one liner] Link: http://lkml.kernel.org/r/20170726164319.GC29716@redhat.com [rientjes@google.com: remove unused mmput_async] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708141733130.50317@chino.kir.corp.google.com [aarcange@redhat.com: microoptimization] Link: http://lkml.kernel.org/r/20170817171240.GB5066@redhat.com Link: http://lkml.kernel.org/r/20170726162912.GA29716@redhat.com Fixes: 26db62f179d1 ("oom: keep mm of the killed task available") Signed-off-by: Andrea Arcangeli Signed-off-by: David Rientjes Reported-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4e5345c07344..7ed64600da6c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -922,7 +922,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -938,22 +937,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU -static void mmput_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmput(mm); -} - -void mmput_async(struct mm_struct *mm) -{ - if (atomic_dec_and_test(&mm->mm_users)) { - INIT_WORK(&mm->async_put_work, mmput_async_fn); - schedule_work(&mm->async_put_work); - } -} -#endif - /** * set_mm_exe_file - change a reference to the mm's executable file * -- cgit v1.2.3 From d2cd9ede6e193dd7d88b6d27399e96229a551b19 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 6 Sep 2017 16:25:15 -0700 Subject: mm,fork: introduce MADV_WIPEONFORK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce MADV_WIPEONFORK semantics, which result in a VMA being empty in the child process after fork. This differs from MADV_DONTFORK in one important way. If a child process accesses memory that was MADV_WIPEONFORK, it will get zeroes. The address ranges are still valid, they are just empty. If a child process accesses memory that was MADV_DONTFORK, it will get a segmentation fault, since those address ranges are no longer valid in the child after fork. Since MADV_DONTFORK also seems to be used to allow very large programs to fork in systems with strict memory overcommit restrictions, changing the semantics of MADV_DONTFORK might break existing programs. MADV_WIPEONFORK only works on private, anonymous VMAs. The use case is libraries that store or cache information, and want to know that they need to regenerate it in the child process after fork. Examples of this would be: - systemd/pulseaudio API checks (fail after fork) (replacing a getpid check, which is too slow without a PID cache) - PKCS#11 API reinitialization check (mandated by specification) - glibc's upcoming PRNG (reseed after fork) - OpenSSL PRNG (reseed after fork) The security benefits of a forking server having a re-inialized PRNG in every child process are pretty obvious. However, due to libraries having all kinds of internal state, and programs getting compiled with many different versions of each library, it is unreasonable to expect calling programs to re-initialize everything manually after fork. A further complication is the proliferation of clone flags, programs bypassing glibc's functions to call clone directly, and programs calling unshare, causing the glibc pthread_atfork hook to not get called. It would be better to have the kernel take care of this automatically. The patch also adds MADV_KEEPONFORK, to undo the effects of a prior MADV_WIPEONFORK. This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO: https://man.openbsd.org/minherit.2 [akpm@linux-foundation.org: numerically order arch/parisc/include/uapi/asm/mman.h #defines] Link: http://lkml.kernel.org/r/20170811212829.29186-3-riel@redhat.com Signed-off-by: Rik van Riel Reported-by: Florian Weimer Reported-by: Colm MacCártaigh Reviewed-by: Mike Kravetz Cc: "H. Peter Anvin" Cc: "Kirill A. 
Shutemov" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Helge Deller Cc: Kees Cook Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Will Drewry Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7ed64600da6c..24a4c0be80d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; - if (anon_vma_fork(tmp, mpnt)) + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; @@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); -- cgit v1.2.3 From 170b3b1050e28d1ba0700e262f0899ffa4fccc52 Mon Sep 17 00:00:00 2001 From: Baohong Liu Date: Tue, 5 Sep 2017 16:57:19 -0500 Subject: tracing: Apply trace_clock changes to instance max buffer Currently trace_clock timestamps are applied to both regular and max buffers only for global trace. For instance trace, trace_clock timestamps are applied only to regular buffer. But, regular and max buffers can be swapped, for example, following a snapshot. So, for instance trace, bad timestamps can be seen following a snapshot. Let's apply trace_clock timestamps to instance max buffer as well. Link: http://lkml.kernel.org/r/ebdb168d0be042dcdf51f81e696b17fabe3609c1.1504642143.git.tom.zanussi@linux.intel.com Cc: stable@vger.kernel.org Fixes: 277ba0446 ("tracing: Add interface to allow multiple trace buffers") Signed-off-by: Baohong Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78842557eea0..5360b7aec57a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6229,7 +6229,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif -- cgit v1.2.3 From a731ebe6f17bff9e7ca12ef227f9da4d5bdf8425 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Sep 2017 12:51:31 +0200 Subject: sched/fair: Fix wake_affine_llc() balancing rules Chris Wilson reported that the SMT balance rules got the +1 on the wrong side, resulting in a bias towards the current LLC; which the load-balancer would then try and undo. 
Reported-by: Chris Wilson Tested-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: 90001d67be2f ("sched/fair: Fix wake_affine() for !NUMA_BALANCING") Link: http://lkml.kernel.org/r/20170906105131.gqjmaextmn3u6tj2@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8d5868771cb3..9dd2ce1e5ca2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5435,7 +5435,7 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p, return false; /* if this cache has capacity, come here */ - if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) + if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) return true; /* -- cgit v1.2.3 From 12ac1d0f6c3e95732d144ffa65c8b20fbd9aa462 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Sep 2017 10:12:20 +0200 Subject: genirq: Make sparse_irq_lock protect what it should protect for_each_active_irq() iterates the sparse irq allocation bitmap. The caller must hold sparse_irq_lock. Several code pathes expect that an active bit in the sparse bitmap also has a valid interrupt descriptor. Unfortunately that's not true. The (de)allocation is a two step process, which holds the sparse_irq_lock only across the queue/remove from the radix tree and the set/clear in the allocation bitmap. If a iteration locks sparse_irq_lock between the two steps, then it might see an active bit but the corresponding irq descriptor is NULL. If that is dereferenced unconditionally, then the kernel oopses. Of course, all iterator sites could be audited and fixed, but.... There is no reason why the sparse_irq_lock needs to be dropped between the two steps, in fact the code becomes simpler when the mutex is held across both and the semantics become more straight forward, so future problems of missing NULL pointer checks in the iteration are avoided and all existing sites are fixed in one go. Expand the lock held sections so both operations are covered and the bitmap and the radixtree are in sync. Fixes: a05a900a51c7 ("genirq: Make sparse_lock a mutex") Reported-and-tested-by: Huang Ying Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/irq/irqdesc.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 73be2b3909bd..82afb7ed369f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -421,10 +421,8 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - mutex_lock(&sparse_irq_lock); kobject_del(&desc->kobj); delete_irq_desc(irq); - mutex_unlock(&sparse_irq_lock); /* * We free the descriptor, masks and stat fields via RCU. 
That @@ -462,20 +460,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; - mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); - mutex_unlock(&sparse_irq_lock); } + bitmap_set(allocated_irqs, start, cnt); return start; err: for (i--; i >= 0; i--) free_desc(start + i); - - mutex_lock(&sparse_irq_lock); - bitmap_clear(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); return -ENOMEM; } @@ -575,6 +568,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, desc->owner = owner; } + bitmap_set(allocated_irqs, start, cnt); return start; } @@ -670,10 +664,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt) if (from >= nr_irqs || (from + cnt) > nr_irqs) return; + mutex_lock(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); - mutex_lock(&sparse_irq_lock); bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } @@ -720,19 +714,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) - goto err; + goto unlock; if (start + cnt > nr_irqs) { ret = irq_expand_nr_irqs(start + cnt); if (ret) - goto err; + goto unlock; } - - bitmap_set(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, affinity, owner); - -err: + ret = alloc_descs(start, cnt, node, affinity, owner); +unlock: mutex_unlock(&sparse_irq_lock); return ret; } -- cgit v1.2.3 From 50e76632339d4655859523a39249dd95ee5e93e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 11:13:38 +0200 Subject: sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs Cpusets vs. suspend-resume is _completely_ broken. And it got noticed because it now resulted in non-cpuset usage breaking too. On suspend cpuset_cpu_inactive() doesn't call into cpuset_update_active_cpus() because it doesn't want to move tasks about, there is no need, all tasks are frozen and won't run again until after we've resumed everything. But this means that when we finally do call into cpuset_update_active_cpus() after resuming the last frozen cpu in cpuset_cpu_active(), the top_cpuset will not have any difference with the cpu_active_mask and this it will not in fact do _anything_. So the cpuset configuration will not be restored. This was largely hidden because we would unconditionally create identity domains and mobile users would not in fact use cpusets much. And servers what do use cpusets tend to not suspend-resume much. An addition problem is that we'd not in fact wait for the cpuset work to finish before resuming the tasks, allowing spurious migrations outside of the specified domains. Fix the rebuild by introducing cpuset_force_rebuild() and fix the ordering with cpuset_wait_for_hotplug(). Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Tejun Heo Cc: Thomas Gleixner Fixes: deb7aa308ea2 ("cpuset: reorganize CPU / memory hotplug handling") Link: http://lkml.kernel.org/r/20170907091338.orwxrqkbfkki3c24@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 16 +++++++++++++++- kernel/power/process.c | 5 ++++- kernel/sched/core.c | 7 +++---- 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2f4039bafebb..0513ee39698b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2267,6 +2267,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2341,8 +2348,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(void) @@ -2355,6 +2364,11 @@ void cpuset_update_active_cpus(void) schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. diff --git a/kernel/power/process.c b/kernel/power/process.c index 78672d324a6e..50f25cb370c6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -20,8 +20,9 @@ #include #include #include +#include -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; @@ -202,6 +203,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d2c7ff9ba98..136a76d80dbf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5556,16 +5556,15 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); + partition_sched_domains(1, NULL, NULL); + if (--num_cpus_frozen) return; - } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- cgit v1.2.3 From 133ff0eac95b7dc6edf89dc51bd139a0630bbae7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Fri, 8 Sep 2017 16:11:23 -0700 Subject: mm/hmm: heterogeneous memory management (HMM for short) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM provides 3 separate types of functionality: - Mirroring: synchronize CPU page table and device page table - Device memory: allocating struct page for device memory - Migration: migrating regular memory to device memory This patch introduces some common helpers and definitions to all of those 3 functionality. 
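[Editorial sketch, not part of the commit: the only kernel/ change here wires per-mm HMM state into the mm lifetime. Below is a rough userspace model of that init/destroy pairing and of the usual compile-out stub pattern; the struct, config symbol and hook names are placeholders, not the kernel API.]

#include <stdlib.h>

struct toy_mm { void *hmm_state; /* per-mm state for the optional feature */ };

#ifdef CONFIG_TOY_HMM
static void toy_hmm_mm_init(struct toy_mm *mm)    { mm->hmm_state = NULL; }
static void toy_hmm_mm_destroy(struct toy_mm *mm) { free(mm->hmm_state); }
#else
/* with the feature compiled out the hooks collapse to no-ops, so the
 * call sites (mm_init()/__mmdrop() in the real patch) stay unconditional */
static inline void toy_hmm_mm_init(struct toy_mm *mm)    { (void)mm; }
static inline void toy_hmm_mm_destroy(struct toy_mm *mm) { (void)mm; }
#endif

static struct toy_mm *toy_mm_alloc(void)
{
	struct toy_mm *mm = calloc(1, sizeof(*mm));

	if (mm)
		toy_hmm_mm_init(mm);	/* paired with the destroy below */
	return mm;
}

static void toy_mm_free(struct toy_mm *mm)
{
	toy_hmm_mm_destroy(mm);		/* tear down before the mm itself goes */
	free(mm);
}

int main(void)
{
	struct toy_mm *mm = toy_mm_alloc();

	if (mm)
		toy_mm_free(mm);
	return 0;
}
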
Link: http://lkml.kernel.org/r/20170817000548.32038-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 24a4c0be80d5..2ccbbbfcb7b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); + hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); -- cgit v1.2.3 From 5042db43cc26f51eed51c56192e2c2317e44315f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Fri, 8 Sep 2017 16:11:43 -0700 Subject: mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM (heterogeneous memory management) need struct page to support migration from system main memory to device memory. Reasons for HMM and migration to device memory is explained with HMM core patch. This patch deals with device memory that is un-addressable memory (ie CPU can not access it). Hence we do not want those struct page to be manage like regular memory. That is why we extend ZONE_DEVICE to support different types of memory. A persistent memory type is define for existing user of ZONE_DEVICE and a new device un-addressable type is added for the un-addressable memory type. There is a clear separation between what is expected from each memory type and existing user of ZONE_DEVICE are un-affected by new requirement and new use of the un-addressable type. All specific code path are protect with test against the memory type. Because memory is un-addressable we use a new special swap type for when a page is migrated to device memory (this reduces the number of maximum swap file). The main two additions beside memory type to ZONE_DEVICE is two callbacks. First one, page_free() is call whenever page refcount reach 1 (which means the page is free as ZONE_DEVICE page never reach a refcount of 0). This allow device driver to manage its memory and associated struct page. The second callback page_fault() happens when there is a CPU access to an address that is back by a device page (which are un-addressable by the CPU). This callback is responsible to migrate the page back to system main memory. Device driver can not block migration back to system memory, HMM make sure that such page can not be pin into device memory. If device is in some error condition and can not migrate memory back then a CPU page fault to device memory should end with SIGBUS. 
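[Editorial sketch, not part of the commit: a compact model of the two driver callbacks this commit describes. Type and field names are stand-ins for illustration and do not reflect the kernel's struct dev_pagemap layout.]

#include <stddef.h>

struct toy_page;

struct toy_pagemap {
	/*
	 * Called once a device-private page drops to its last reference;
	 * such pages are considered free at refcount 1, never 0, so this
	 * is where the driver reclaims the backing device memory.
	 */
	void (*page_free)(struct toy_page *page, void *data);
	/*
	 * Called when the CPU faults on an address backed by unaddressable
	 * device memory; it must migrate the page back to system RAM, and
	 * a failure is reported to the faulting task as SIGBUS.
	 */
	int (*page_fault)(struct toy_page *page, unsigned long addr,
			  void *data);
	/* driver-private cookie handed back to both callbacks */
	void *data;
};
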
[arnd@arndb.de: fix warning] Link: http://lkml.kernel.org/r/20170823133213.712917-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170817000548.32038-8-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Arnd Bergmann Acked-by: Dan Williams Cc: Ross Zwisler Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. McKenney Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 066e73c2fcc9..f1d1e0dfe8b4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ @@ -219,6 +221,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ pgoff += 1UL << order, order = order_at((res), pgoff)) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + struct page *page = device_private_entry_to_page(entry); + + /* + * The page_fault() callback must migrate page back to system memory + * so that CPU can access it. This might fail for various reasons + * (device issue, device was unsafely unplugged, ...). When such + * error conditions happen, the callback must return VM_FAULT_SIGBUS. + * + * Note that because memory cgroup charges are accounted to the device + * memory, this should never fail because of memory restrictions (but + * allocation of regular system page might still fail because we are + * out of memory). + * + * There is a more in-depth description of what that callback can and + * cannot do, in include/linux/memremap.h + */ + return page->pgmap->page_fault(vma, addr, page, flags, pmdp); +} +EXPORT_SYMBOL(device_private_entry_fault); +#endif /* CONFIG_DEVICE_PRIVATE */ + static void pgmap_radix_release(struct resource *res) { unsigned long pgoff, order; @@ -356,6 +386,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, } pgmap->ref = ref; pgmap->res = &page_map->res; + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_fault = NULL; + pgmap->page_free = NULL; + pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; -- cgit v1.2.3 From 7b2d55d2c8961ae9d456d3133f4ae2f0fbd3e14f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Fri, 8 Sep 2017 16:11:46 -0700 Subject: mm/ZONE_DEVICE: special case put_page() for device private pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A ZONE_DEVICE page that reach a refcount of 1 is free ie no longer have any user. For device private pages this is important to catch and thus we need to special case put_page() for this. Link: http://lkml.kernel.org/r/20170817000548.32038-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Kirill A. Shutemov Cc: Dan Williams Cc: Ross Zwisler Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index f1d1e0dfe8b4..1403cf16fa61 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,7 +11,6 @@ * General Public License for more details. */ #include -#include #include #include #include @@ -500,3 +499,27 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } #endif /* CONFIG_ZONE_DEVICE */ + + +#ifdef CONFIG_DEVICE_PRIVATE +void put_zone_device_private_page(struct page *page) +{ + int count = page_ref_dec_return(page); + + /* + * If refcount is 1 then page is freed and refcount is stable as nobody + * holds a reference on the page. + */ + if (count == 1) { + /* Clear Active bit in case of parallel mark_page_accessed */ + __ClearPageActive(page); + __ClearPageWaiters(page); + + page->mapping = NULL; + + page->pgmap->page_free(page, page->pgmap->data); + } else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_zone_device_private_page); +#endif /* CONFIG_DEVICE_PRIVATE */ -- cgit v1.2.3 From c733a82874a79261866a4178edbb608847df4879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Fri, 8 Sep 2017 16:11:54 -0700 Subject: mm/memcontrol: support MEMORY_DEVICE_PRIVATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM pages (private or public device pages) are ZONE_DEVICE page and thus need special handling when it comes to lru or refcount. This patch make sure that memcontrol properly handle those when it face them. Those pages are use like regular pages in a process address space either as anonymous page or as file back page. So from memcg point of view we want to handle them like regular page for now at least. Link: http://lkml.kernel.org/r/20170817000548.32038-11-jglisse@redhat.com Signed-off-by: Jérôme Glisse Acked-by: Balbir Singh Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Aneesh Kumar Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 1403cf16fa61..ea0e18a2a5f2 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -516,6 +516,7 @@ void put_zone_device_private_page(struct page *page) __ClearPageWaiters(page); page->mapping = NULL; + mem_cgroup_uncharge(page); page->pgmap->page_free(page, page->pgmap->data); } else if (!count) -- cgit v1.2.3 From df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Fri, 8 Sep 2017 16:12:24 -0700 Subject: mm/device-public-memory: device memory cache coherent with CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Platform with advance system bus (like CAPI or CCIX) allow device memory to be accessible from CPU in a cache coherent fashion. Add a new type of ZONE_DEVICE to represent such memory. The use case are the same as for the un-addressable device memory but without all the corners cases. 
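[Editorial sketch, not part of the commit: the put_page() special case these ZONE_DEVICE commits rely on, introduced for private pages above and widened to public pages here, reduces to the small release rule below. This is a userspace model with stand-in types, not the kernel helper.]

#include <stdatomic.h>
#include <stddef.h>

struct toy_dev_page {
	atomic_int refcount;	/* device pages are considered free at 1, not 0 */
	void (*page_free)(struct toy_dev_page *page, void *data);
	void *data;
};

static void toy_put_device_page(struct toy_dev_page *page)
{
	int count = atomic_fetch_sub(&page->refcount, 1) - 1;

	if (count == 1) {
		/* last external reference gone: hand the backing device
		 * memory back to the driver */
		page->page_free(page, page->data);
	} else if (count == 0) {
		/* would take the normal final-free path (__put_page() in
		 * the kernel); nothing to model in this sketch */
	}
}
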
Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Paul E. McKenney Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: Ross Zwisler Cc: Balbir Singh Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index ea0e18a2a5f2..6bcbfbf1a8fd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -501,8 +501,8 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) #endif /* CONFIG_ZONE_DEVICE */ -#ifdef CONFIG_DEVICE_PRIVATE -void put_zone_device_private_page(struct page *page) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page) { int count = page_ref_dec_return(page); @@ -522,5 +522,5 @@ void put_zone_device_private_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_page); -#endif /* CONFIG_DEVICE_PRIVATE */ +EXPORT_SYMBOL(put_zone_device_private_or_public_page); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -- cgit v1.2.3 From 9b130ad5bb8255ee8534d92d67e12b2a4887eacb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:14:18 -0700 Subject: treewide: make "nr_cpu_ids" unsigned First, number of CPUs can't be negative number. Second, different signnnedness leads to suboptimal code in the following cases: 1) kmalloc(nr_cpu_ids * sizeof(X)); "int" has to be sign extended to size_t. 2) while (loff_t *pos < nr_cpu_ids) MOVSXD is 1 byte longed than the same MOV. Other cases exist as well. Basically compiler is told that nr_cpu_ids can't be negative which can't be deduced if it is "int". Code savings on allyesconfig kernel: -3KB add/remove: 0/0 grow/shrink: 25/264 up/down: 261/-3631 (-3370) function old new delta coretemp_cpu_online 450 512 +62 rcu_init_one 1234 1272 +38 pci_device_probe 374 399 +25 ... 
pgdat_reclaimable_pages 628 556 -72 select_fallback_rq 446 369 -77 task_numa_find_cpu 1923 1807 -116 Link: http://lkml.kernel.org/r/20170819114959.GA30580@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- kernel/sched/topology.c | 2 +- kernel/smp.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84fe96641b2e..1250e4bd4b85 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 55bde94b9572..e012b9be777e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..5d0062cc10cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -473,7 +473,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; diff --git a/kernel/smp.c b/kernel/smp.c index 81cfca9b4cc3..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -550,7 +550,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; +unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); -- cgit v1.2.3 From bfb068892d30dcf0a32b89302fe293347adeaaaa Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:14:55 -0700 Subject: sched/fair: replace cfs_rq->rb_leftmost ... with the generic rbtree flavor instead. No changes in semantics whatsoever. 
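[Editorial sketch, not part of the commit: condensed to its essentials, the conversion in the diff below follows the pattern outlined here with the kernel's cached-rbtree API from <linux/rbtree.h>. struct toy_entity and key_before() are placeholders for the scheduler's types, so this is an outline of the idiom rather than the scheduler code itself.]

#include <linux/rbtree.h>
#include <linux/types.h>

struct toy_entity {
	u64 key;
	struct rb_node node;
};

static struct rb_root_cached toy_timeline;

static void toy_timeline_init(void)
{
	toy_timeline = RB_ROOT_CACHED;	/* empty tree, no cached leftmost */
}

static bool key_before(struct toy_entity *a, struct toy_entity *b)
{
	return a->key < b->key;
}

static void toy_enqueue(struct toy_entity *e)
{
	struct rb_node **link = &toy_timeline.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		parent = *link;
		if (key_before(e, rb_entry(parent, struct toy_entity, node))) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;	/* went right at least once */
		}
	}
	rb_link_node(&e->node, parent, link);
	/* the cached flavor maintains the leftmost pointer itself, so the
	 * hand-rolled rb_leftmost bookkeeping can simply be deleted */
	rb_insert_color_cached(&e->node, &toy_timeline, leftmost);
}

static void toy_dequeue(struct toy_entity *e)
{
	rb_erase_cached(&e->node, &toy_timeline);
}

static struct toy_entity *toy_pick_first(void)
{
	struct rb_node *left = rb_first_cached(&toy_timeline);	/* O(1) */

	return left ? rb_entry(left, struct toy_entity, node) : NULL;
}
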
Link: http://lkml.kernel.org/r/20170719014603.19029-8-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 39 +++++++++++++-------------------------- kernel/sched/sched.h | 3 +-- 3 files changed, 15 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..8e536d963652 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -530,7 +530,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) + if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bc0a883d190..a5d83ed8dd82 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); + if (leftmost) { /* non-empty tree */ + struct sched_entity *se; + se = rb_entry(leftmost, struct sched_entity, run_node); if (!curr) vruntime = se->vruntime; @@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - int leftmost = 1; + bool leftmost = true; /* * Find the right place in the rbtree: @@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + rb_insert_color_cached(&se->run_node, + &cfs_rq->tasks_timeline, leftmost); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { - struct rb_node *left = cfs_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); if (!left) return NULL; @@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; @@ -9312,7 +9299,7 @@ static 
void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { - cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6ed7962dc896..c30c57563dbc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -426,8 +426,7 @@ struct cfs_rq { u64 min_vruntime_copy; #endif - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. -- cgit v1.2.3 From 2161573ecd6931565936cb66793b2d2bf805c088 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:14:58 -0700 Subject: sched/deadline: replace earliest dl and rq leftmost caching ... with the generic rbtree flavor instead. No changes in semantics whatsoever. Link: http://lkml.kernel.org/r/20170719014603.19029-9-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/deadline.c | 50 +++++++++++++++++++------------------------------ kernel/sched/sched.h | 6 ++---- 2 files changed, 21 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9e38df7649f4..0191ec7667c3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->rb_leftmost == &dl_se->rb_node; + return dl_rq->root.rb_leftmost == &dl_se->rb_node; } void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) @@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b) void init_dl_rq(struct dl_rq *dl_rq) { - dl_rq->rb_root = RB_ROOT; + dl_rq->root = RB_ROOT_CACHED; #ifdef CONFIG_SMP /* zero means no -deadline tasks */ @@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq) dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; - dl_rq->pushable_dl_tasks_root = RB_ROOT; + dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else init_dl_bw(&dl_rq->dl_bw); #endif @@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; struct rb_node *parent = NULL; struct task_struct *entry; - int leftmost = 1; + bool leftmost = true; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); @@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) link = &parent->rb_left; else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) { - dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + if (leftmost) dl_rq->earliest_dl.next = p->dl.deadline; - } rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_insert_color_cached(&p->pushable_dl_tasks, + &dl_rq->pushable_dl_tasks_root, leftmost); } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if 
(dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { struct rb_node *next_node; next_node = rb_next(&p->pushable_dl_tasks); - dl_rq->pushable_dl_tasks_leftmost = next_node; if (next_node) { dl_rq->earliest_dl.next = rb_entry(next_node, struct task_struct, pushable_dl_tasks)->dl.deadline; } } - rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } static inline int has_pushable_dl_tasks(struct rq *rq) { - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); } static int push_dl_task(struct rq *rq); @@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { - struct rb_node *leftmost = dl_rq->rb_leftmost; + struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); @@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node **link = &dl_rq->root.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_dl_entity *entry; int leftmost = 1; @@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) } } - if (leftmost) - dl_rq->rb_leftmost = &dl_se->rb_node; - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); inc_dl_tasks(dl_se, dl_rq); } @@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) if (RB_EMPTY_NODE(&dl_se->rb_node)) return; - if (dl_rq->rb_leftmost == &dl_se->rb_node) { - struct rb_node *next_node; - - next_node = rb_next(&dl_se->rb_node); - dl_rq->rb_leftmost = next_node; - } - - rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + rb_erase_cached(&dl_se->rb_node, &dl_rq->root); RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); @@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, struct dl_rq *dl_rq) { - struct rb_node *left = dl_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; @@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost; + struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; if (!has_pushable_dl_tasks(rq)) @@ -1945,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, struct task_struct, pushable_dl_tasks); BUG_ON(rq->cpu != task_cpu(p)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c30c57563dbc..746ac78ff492 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -549,8 +549,7 @@ struct rt_rq { /* Deadline class' related fields in a runqueue */ struct 
dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root rb_root; - struct rb_node *rb_leftmost; + struct rb_root_cached root; unsigned long dl_nr_running; @@ -574,8 +573,7 @@ struct dl_rq { * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. */ - struct rb_root pushable_dl_tasks_root; - struct rb_node *pushable_dl_tasks_leftmost; + struct rb_root_cached pushable_dl_tasks_root; #else struct dl_bw dl_bw; #endif -- cgit v1.2.3 From a23ba907d5e65d6aeea3e59c82fda9cd206a7aad Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:01 -0700 Subject: locking/rtmutex: replace top-waiter and pi_waiters leftmost caching ... with the generic rbtree flavor instead. No changes in semantics whatsoever. Link: http://lkml.kernel.org/r/20170719014603.19029-10-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +-- kernel/locking/rtmutex-debug.c | 2 +- kernel/locking/rtmutex.c | 35 +++++++++++------------------------ kernel/locking/rtmutex_common.h | 12 ++++++------ 4 files changed, 19 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2ccbbbfcb7b8..6f1b0af00bda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1462,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - p->pi_waiters = RB_ROOT; - p->pi_waiters_leftmost = NULL; + p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index ac35e648b0e5..f4a74e78d467 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..6f3dba6e4e9e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_node; + struct rb_node **link = &lock->waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); + rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); } static void @@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->tree_entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); - - rb_erase(&waiter->tree_entry, &lock->waiters); + rb_erase_cached(&waiter->tree_entry, &lock->waiters); 
RB_CLEAR_NODE(&waiter->tree_entry); } static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node **link = &task->pi_waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); } static void @@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) return; - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); - - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); RB_CLEAR_NODE(&waiter->pi_tree_entry); } @@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lock->waiters = RB_ROOT_CACHED; if (name && key) debug_rt_mutex_init(lock, name, key); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8d039b928d61..7453be0485a5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -45,7 +45,7 @@ struct rt_mutex_waiter { static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * @@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); + w = rb_entry(lock->waiters.rb_leftmost, + struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; @@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, + struct rt_mutex_waiter, pi_tree_entry); } #else -- cgit v1.2.3 From 235586939d7fe4833ada9e988f92af543ee6851f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 8 Sep 2017 16:17:00 -0700 Subject: kmod: split out umh code into its own file Patch series "kmod: few code cleanups to split out umh code" The usermode helper has a provenance from the old usb code which first required a usermode helper. Eventually this was shoved into kmod.c and the kernel's modprobe calls was converted over eventually to share the same code. Over time the list of usermode helpers in the kernel has grown -- so kmod is just but one user of the API. This series is a simple logical cleanup which acknowledges the code evolution of the usermode helper and shoves the UMH API into its own dedicated file. 
This way users of the API can later just include umh.h instead of kmod.h. Note despite the diff state the first patch really is just a code shove, no functional changes are done there. I did use git format-patch -M to generate the patch, but in the end the split was not enough for git to consider it a rename hence the large diffstat. I've put this through 0-day and it gives me their machine compilation blessings with all tests as OK. This patch (of 4): There's a slew of usermode helper users and kmod is just one of them. Split out the usermode helper code into its own file to keep the logic and focus split up. This change provides no functional changes. Link: http://lkml.kernel.org/r/20170810180618.22457-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: Guenter Roeck Cc: "Eric W. Biederman" Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/kmod.c | 560 +------------------------------------------------------ kernel/umh.c | 568 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 571 insertions(+), 559 deletions(-) create mode 100644 kernel/umh.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9c323a6daa46..44abbb0104b6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ + signal.o sys.o umh.o kmod.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ diff --git a/kernel/kmod.c b/kernel/kmod.c index 2f37acde640b..cdff52974d18 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -1,23 +1,6 @@ /* - kmod, the new module loader (replaces kerneld) - Kirk Petersen - - Reorganized not to be a daemon by Adam Richter, with guidance - from Greg Zornetzer. - - Modified to avoid chroot and file sharing problems. - Mikael Pettersson - - Limit the concurrent number of kmod modprobes to catch loops from - "modprobe needs a service that is in a module". - Keith Owens December 1999 - - Unblock all signals when we exec a usermode process. - Shuu Yamaguchi December 2000 - - call_usermodehelper wait flag, and remove exec_usermodehelper. - Rusty Russell Jan 2003 -*/ + * kmod - the kernel module loader + */ #include #include #include @@ -45,14 +28,6 @@ #include -#define CAP_BSET (void *)1 -#define CAP_PI (void *)2 - -static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; -static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; -static DEFINE_SPINLOCK(umh_sysctl_lock); -static DECLARE_RWSEM(umhelper_sem); - #ifdef CONFIG_MODULES /* * Assuming: @@ -204,534 +179,3 @@ int __request_module(bool wait, const char *fmt, ...) EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ - -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away - * or the caller used UMH_NO_WAIT. 
- */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - -/* - * This is the task which runs the usermode application - */ -static int call_usermodehelper_exec_async(void *data) -{ - struct subprocess_info *sub_info = data; - struct cred *new; - int retval; - - spin_lock_irq(¤t->sighand->siglock); - flush_signal_handlers(current, 1); - spin_unlock_irq(¤t->sighand->siglock); - - /* - * Our parent (unbound workqueue) runs with elevated scheduling - * priority. Avoid propagating that into the userspace child. - */ - set_user_nice(current, 0); - - retval = -ENOMEM; - new = prepare_kernel_cred(current); - if (!new) - goto out; - - spin_lock(&umh_sysctl_lock); - new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); - new->cap_inheritable = cap_intersect(usermodehelper_inheritable, - new->cap_inheritable); - spin_unlock(&umh_sysctl_lock); - - if (sub_info->init) { - retval = sub_info->init(sub_info, new); - if (retval) { - abort_creds(new); - goto out; - } - } - - commit_creds(new); - - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); -out: - sub_info->retval = retval; - /* - * call_usermodehelper_exec_sync() will call umh_complete - * if UHM_WAIT_PROC. - */ - if (!(sub_info->wait & UMH_WAIT_PROC)) - umh_complete(sub_info); - if (!retval) - return 0; - do_exit(0); -} - -/* Handles UMH_WAIT_PROC. */ -static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) -{ - pid_t pid; - - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } - - /* Restore default kernel sig handler */ - kernel_sigaction(SIGCHLD, SIG_IGN); - - umh_complete(sub_info); -} - -/* - * We need to create the usermodehelper kernel thread from a task that is affine - * to an optimized set of CPUs (or nohz housekeeping ones) such that they - * inherit a widest affinity irrespective of call_usermodehelper() callers with - * possibly reduced affinity (eg: per-cpu workqueues). We don't want - * usermodehelper targets to contend a busy CPU. - * - * Unbound workqueues provide such wide affinity and allow to block on - * UMH_WAIT_PROC requests without blocking pending request (up to some limit). - * - * Besides, workqueues provide the privilege level that caller might not have - * to perform the usermodehelper request. 
- * - */ -static void call_usermodehelper_exec_work(struct work_struct *work) -{ - struct subprocess_info *sub_info = - container_of(work, struct subprocess_info, work); - - if (sub_info->wait & UMH_WAIT_PROC) { - call_usermodehelper_exec_sync(sub_info); - } else { - pid_t pid; - /* - * Use CLONE_PARENT to reparent it to kthreadd; we do not - * want to pollute current->children, and we need a parent - * that always ignores SIGCHLD to ensure auto-reaping. - */ - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - CLONE_PARENT | SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - umh_complete(sub_info); - } - } -} - -/* - * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY - * (used for preventing user land processes from being created after the user - * land has been frozen during a system-wide hibernation or suspend operation). - * Should always be manipulated under umhelper_sem acquired for write. - */ -static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; - -/* Number of helpers running */ -static atomic_t running_helpers = ATOMIC_INIT(0); - -/* - * Wait queue head used by usermodehelper_disable() to wait for all running - * helpers to finish. - */ -static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); - -/* - * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled - * to become 'false'. - */ -static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); - -/* - * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_disable() fails - */ -#define RUNNING_HELPERS_TIMEOUT (5 * HZ) - -int usermodehelper_read_trylock(void) -{ - DEFINE_WAIT(wait); - int ret = 0; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_INTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - if (usermodehelper_disabled == UMH_DISABLED) - ret = -EAGAIN; - - up_read(&umhelper_sem); - - if (ret) - break; - - schedule(); - try_to_freeze(); - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return ret; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); - -long usermodehelper_read_lock_wait(long timeout) -{ - DEFINE_WAIT(wait); - - if (timeout < 0) - return -EINVAL; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_UNINTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - up_read(&umhelper_sem); - - timeout = schedule_timeout(timeout); - if (!timeout) - break; - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return timeout; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); - -void usermodehelper_read_unlock(void) -{ - up_read(&umhelper_sem); -} -EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); - -/** - * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * @depth: New value to assign to usermodehelper_disabled. - * - * Change the value of usermodehelper_disabled (under umhelper_sem locked for - * writing) and wakeup tasks waiting for it to change. - */ -void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) -{ - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - wake_up(&usermodehelper_disabled_waitq); - up_write(&umhelper_sem); -} - -/** - * __usermodehelper_disable - Prevent new helpers from being started. - * @depth: New value to assign to usermodehelper_disabled. 
- * - * Set usermodehelper_disabled to @depth and wait for running helpers to exit. - */ -int __usermodehelper_disable(enum umh_disable_depth depth) -{ - long retval; - - if (!depth) - return -EINVAL; - - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - up_write(&umhelper_sem); - - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, - atomic_read(&running_helpers) == 0, - RUNNING_HELPERS_TIMEOUT); - if (retval) - return 0; - - __usermodehelper_set_disable_depth(UMH_ENABLED); - return -EAGAIN; -} - -static void helper_lock(void) -{ - atomic_inc(&running_helpers); - smp_mb__after_atomic(); -} - -static void helper_unlock(void) -{ - if (atomic_dec_and_test(&running_helpers)) - wake_up(&running_helpers_waitq); -} - -/** - * call_usermodehelper_setup - prepare to call a usermode helper - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @gfp_mask: gfp mask for memory allocation - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * Returns either %NULL on allocation failure, or a subprocess_info - * structure. This should be passed to call_usermodehelper_exec to - * exec the process and free the structure. - * - * The init function is used to customize the helper process prior to - * exec. A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, - char **envp, gfp_t gfp_mask, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); - if (!sub_info) - goto out; - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - -#ifdef CONFIG_STATIC_USERMODEHELPER - sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; -#else - sub_info->path = path; -#endif - sub_info->argv = argv; - sub_info->envp = envp; - - sub_info->cleanup = cleanup; - sub_info->init = init; - sub_info->data = data; - out: - return sub_info; -} -EXPORT_SYMBOL(call_usermodehelper_setup); - -/** - * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of system workqueues. - * (ie. it runs with full root capabilities and optimized affinity). 
- */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) -{ - DECLARE_COMPLETION_ONSTACK(done); - int retval = 0; - - if (!sub_info->path) { - call_usermodehelper_freeinfo(sub_info); - return -EINVAL; - } - helper_lock(); - if (usermodehelper_disabled) { - retval = -EBUSY; - goto out; - } - - /* - * If there is no binary for us to call, then just return and get out of - * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and - * disable all call_usermodehelper() calls. - */ - if (strlen(sub_info->path) == 0) - goto out; - - /* - * Set the completion pointer only if there is a waiter. - * This makes it possible to use umh_complete to free - * the data structure in case of UMH_NO_WAIT. - */ - sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; - sub_info->wait = wait; - - queue_work(system_unbound_wq, &sub_info->work); - if (wait == UMH_NO_WAIT) /* task has freed sub_info */ - goto unlock; - - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; - - /* umh_complete() will see NULL and free sub_info */ - if (xchg(&sub_info->complete, NULL)) - goto unlock; - /* fallthrough, umh_complete() was already called */ - } - - wait_for_completion(&done); -wait_done: - retval = sub_info->retval; -out: - call_usermodehelper_freeinfo(sub_info); -unlock: - helper_unlock(); - return retval; -} -EXPORT_SYMBOL(call_usermodehelper_exec); - -/** - * call_usermodehelper() - prepare and start a usermode application - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * This function is the equivalent to use call_usermodehelper_setup() and - * call_usermodehelper_exec(). - */ -int call_usermodehelper(const char *path, char **argv, char **envp, int wait) -{ - struct subprocess_info *info; - gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - - info = call_usermodehelper_setup(path, argv, envp, gfp_mask, - NULL, NULL, NULL); - if (info == NULL) - return -ENOMEM; - - return call_usermodehelper_exec(info, wait); -} -EXPORT_SYMBOL(call_usermodehelper); - -static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; - kernel_cap_t new_cap; - int err, i; - - if (write && (!capable(CAP_SETPCAP) || - !capable(CAP_SYS_MODULE))) - return -EPERM; - - /* - * convert from the global kernel_cap_t to the ulong array to print to - * userspace if this is a read. - */ - spin_lock(&umh_sysctl_lock); - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { - if (table->data == CAP_BSET) - cap_array[i] = usermodehelper_bset.cap[i]; - else if (table->data == CAP_PI) - cap_array[i] = usermodehelper_inheritable.cap[i]; - else - BUG(); - } - spin_unlock(&umh_sysctl_lock); - - t = *table; - t.data = &cap_array; - - /* - * actually read or write and array of ulongs from userspace. 
Remember - * these are least significant 32 bits first - */ - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - /* - * convert from the sysctl array of ulongs to the kernel_cap_t - * internal representation - */ - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) - new_cap.cap[i] = cap_array[i]; - - /* - * Drop everything not in the new_cap (but don't add things) - */ - spin_lock(&umh_sysctl_lock); - if (write) { - if (table->data == CAP_BSET) - usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); - if (table->data == CAP_PI) - usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); - } - spin_unlock(&umh_sysctl_lock); - - return 0; -} - -struct ctl_table usermodehelper_table[] = { - { - .procname = "bset", - .data = CAP_BSET, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { - .procname = "inheritable", - .data = CAP_PI, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { } -}; diff --git a/kernel/umh.c b/kernel/umh.c new file mode 100644 index 000000000000..6ff9905250ff --- /dev/null +++ b/kernel/umh.c @@ -0,0 +1,568 @@ +/* + * umh - the kernel usermode helper + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); + +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + +/* + * This is the task which runs the usermode application + */ +static int call_usermodehelper_exec_async(void *data) +{ + struct subprocess_info *sub_info = data; + struct cred *new; + int retval; + + spin_lock_irq(¤t->sighand->siglock); + flush_signal_handlers(current, 1); + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Our parent (unbound workqueue) runs with elevated scheduling + * priority. Avoid propagating that into the userspace child. 
+ */ + set_user_nice(current, 0); + + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto out; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto out; + } + } + + commit_creds(new); + + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* + * call_usermodehelper_exec_sync() will call umh_complete + * if UHM_WAIT_PROC. + */ + if (!(sub_info->wait & UMH_WAIT_PROC)) + umh_complete(sub_info); + if (!retval) + return 0; + do_exit(0); +} + +/* Handles UMH_WAIT_PROC. */ +static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) +{ + pid_t pid; + + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + kernel_sigaction(SIGCHLD, SIG_DFL); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + } else { + int ret = -ECHILD; + /* + * Normally it is bogus to call wait4() from in-kernel because + * wait4() wants to write the exit code to a userspace address. + * But call_usermodehelper_exec_sync() always runs as kernel + * thread (workqueue) and put_user() to a kernel address works + * OK for kernel threads, due to their having an mm_segment_t + * which spans the entire address space. + * + * Thus the __user pointer cast is valid here. + */ + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either call_usermodehelper_exec_async failed and + * the real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. + */ + if (ret) + sub_info->retval = ret; + } + + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + + umh_complete(sub_info); +} + +/* + * We need to create the usermodehelper kernel thread from a task that is affine + * to an optimized set of CPUs (or nohz housekeeping ones) such that they + * inherit a widest affinity irrespective of call_usermodehelper() callers with + * possibly reduced affinity (eg: per-cpu workqueues). We don't want + * usermodehelper targets to contend a busy CPU. + * + * Unbound workqueues provide such wide affinity and allow to block on + * UMH_WAIT_PROC requests without blocking pending request (up to some limit). + * + * Besides, workqueues provide the privilege level that caller might not have + * to perform the usermodehelper request. + * + */ +static void call_usermodehelper_exec_work(struct work_struct *work) +{ + struct subprocess_info *sub_info = + container_of(work, struct subprocess_info, work); + + if (sub_info->wait & UMH_WAIT_PROC) { + call_usermodehelper_exec_sync(sub_info); + } else { + pid_t pid; + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. 
+ */ + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + umh_complete(sub_info); + } + } +} + +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. + */ +static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_disable() to wait for all running + * helpers to finish. + */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled + * to become 'false'. + */ +static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_disable() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + +int usermodehelper_read_trylock(void) +{ + DEFINE_WAIT(wait); + int ret = 0; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + if (usermodehelper_disabled == UMH_DISABLED) + ret = -EAGAIN; + + up_read(&umhelper_sem); + + if (ret) + break; + + schedule(); + try_to_freeze(); + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return ret; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); + +long usermodehelper_read_lock_wait(long timeout) +{ + DEFINE_WAIT(wait); + + if (timeout < 0) + return -EINVAL; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + up_read(&umhelper_sem); + + timeout = schedule_timeout(timeout); + if (!timeout) + break; + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return timeout; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); + +void usermodehelper_read_unlock(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); + +/** + * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. + * @depth: New value to assign to usermodehelper_disabled. + * + * Change the value of usermodehelper_disabled (under umhelper_sem locked for + * writing) and wakeup tasks waiting for it to change. + */ +void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) +{ + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); +} + +/** + * __usermodehelper_disable - Prevent new helpers from being started. + * @depth: New value to assign to usermodehelper_disabled. + * + * Set usermodehelper_disabled to @depth and wait for running helpers to exit. + */ +int __usermodehelper_disable(enum umh_disable_depth depth) +{ + long retval; + + if (!depth) + return -EINVAL; + + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + up_write(&umhelper_sem); + + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). 
+ */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) + return 0; + + __usermodehelper_set_disable_depth(UMH_ENABLED); + return -EAGAIN; +} + +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +/** + * call_usermodehelper_setup - prepare to call a usermode helper + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @gfp_mask: gfp mask for memory allocation + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data + * + * Returns either %NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. + * + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. + */ +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, + char **envp, gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else + sub_info->path = path; +#endif + sub_info->argv = argv; + sub_info->envp = envp; + + sub_info->cleanup = cleanup; + sub_info->init = init; + sub_info->data = data; + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocessa + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). + */ +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) +{ + DECLARE_COMPLETION_ONSTACK(done); + int retval = 0; + + if (!sub_info->path) { + call_usermodehelper_freeinfo(sub_info); + return -EINVAL; + } + helper_lock(); + if (usermodehelper_disabled) { + retval = -EBUSY; + goto out; + } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? 
NULL : &done; + sub_info->wait = wait; + + queue_work(system_unbound_wq, &sub_info->work); + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ + goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + + wait_for_completion(&done); +wait_done: + retval = sub_info->retval; +out: + call_usermodehelper_freeinfo(sub_info); +unlock: + helper_unlock(); + return retval; +} +EXPORT_SYMBOL(call_usermodehelper_exec); + +/** + * call_usermodehelper() - prepare and start a usermode application + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * This function is the equivalent to use call_usermodehelper_setup() and + * call_usermodehelper_exec(). + */ +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask, + NULL, NULL, NULL); + if (info == NULL) + return -ENOMEM; + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper); + +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write and array of ulongs from userspace. 
Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; -- cgit v1.2.3 From 0ce2c2029312ed78e37b56b08fa0f59ba97ef50b Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 8 Sep 2017 16:17:12 -0700 Subject: kmod: move #ifdef CONFIG_MODULES wrapper to Makefile The entire file is now conditionally compiled only when CONFIG_MODULES is enabled, and this this is a bool. Just move this conditional to the Makefile as its easier to read this way. Link: http://lkml.kernel.org/r/20170810180618.22457-5-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: Guenter Roeck Cc: "Eric W. Biederman" Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 3 ++- kernel/kmod.c | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 44abbb0104b6..ed470aac53da 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,13 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ - signal.o sys.o umh.o kmod.o workqueue.o pid.o task_work.o \ + signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/kmod.c b/kernel/kmod.c index cdff52974d18..bc6addd9152b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -28,7 +28,6 @@ #include -#ifdef CONFIG_MODULES /* * Assuming: * @@ -177,5 +176,3 @@ int __request_module(bool wait, const char *fmt, ...) return ret; } EXPORT_SYMBOL(__request_module); - -#endif /* CONFIG_MODULES */ -- cgit v1.2.3 From a2d818030135c293f878fbb772cf40e7a14c5acc Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Fri, 8 Sep 2017 16:17:19 -0700 Subject: drivers/pps: aesthetic tweaks to PPS-related content Collection of aesthetic adjustments to various PPS-related files, directories and Documentation, some quite minor just for the sake of consistency, including: * Updated example of pps device tree node (courtesy Rodolfo G.) 
* "PPS-API" -> "PPS API" * "pps_source_info_s" -> "pps_source_info" * "ktimer driver" -> "pps-ktimer driver" * "ppstest /dev/pps0" -> "ppstest /dev/pps1" to match example * Add missing PPS-related entries to MAINTAINERS file * Other trivialities Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1708261048220.8106@localhost.localdomain Signed-off-by: Robert P. J. Day Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ea4fb315719..2cafb49aa65e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -#endif +#endif /* CONFIG_NTP_PPS */ /** * xtime_update() - advances the timekeeping infrastructure -- cgit v1.2.3 From 7483e5d420d9d5aa1732c5efb0da59e095a8b24e Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 8 Sep 2017 16:17:35 -0700 Subject: kcov: support compat processes Support compat processes in KCOV by providing compat_ioctl callback. Compat mode uses the same ioctl callback: we have 2 commands that do not use the argument and 1 that already checks that the arg does not overflow INT_MAX. This allows to use KCOV-guided fuzzing in compat processes. Link: http://lkml.kernel.org/r/20170823100553.55812-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index cd771993f96f..3f693a0f6f3e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -270,6 +270,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) static const struct file_operations kcov_fops = { .open = kcov_open, .unlocked_ioctl = kcov_ioctl, + .compat_ioctl = kcov_ioctl, .mmap = kcov_mmap, .release = kcov_close, }; -- cgit v1.2.3 From 109980b894e9dae66c37c3d804a415aa68b19c7e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Sep 2017 00:14:51 +0200 Subject: bpf: don't select potentially stale ri->map from buggy xdp progs We can potentially run into a couple of issues with the XDP bpf_redirect_map() helper. The ri->map in the per CPU storage can become stale in several ways, mostly due to misuse, where we can then trigger a use after free on the map: i) prog A is calling bpf_redirect_map(), returning XDP_REDIRECT and running on a driver not supporting XDP_REDIRECT yet. The ri->map on that CPU becomes stale when the XDP program is unloaded on the driver, and a prog B loaded on a different driver which supports XDP_REDIRECT return code. prog B would have to omit calling to bpf_redirect_map() and just return XDP_REDIRECT, which would then access the freed map in xdp_do_redirect() since not cleared for that CPU. ii) prog A is calling bpf_redirect_map(), returning a code other than XDP_REDIRECT. prog A is then detached, which triggers release of the map. prog B is attached which, similarly as in i), would just return XDP_REDIRECT without having called bpf_redirect_map() and thus be accessing the freed map in xdp_do_redirect() since not cleared for that CPU. iii) prog A is attached to generic XDP, calling the bpf_redirect_map() helper and returning XDP_REDIRECT. 
xdp_do_generic_redirect() is currently not handling ri->map (will be fixed by Jesper), so it's not being reset. Later loading a e.g. native prog B which would, say, call bpf_xdp_redirect() and then returns XDP_REDIRECT would find in xdp_do_redirect() that a map was set and uses that causing use after free on map access. Fix thus needs to avoid accessing stale ri->map pointers, naive way would be to call a BPF function from drivers that just resets it to NULL for all XDP return codes but XDP_REDIRECT and including XDP_REDIRECT for drivers not supporting it yet (and let ri->map being handled in xdp_do_generic_redirect()). There is a less intrusive way w/o letting drivers call a reset for each BPF run. The verifier knows we're calling into bpf_xdp_redirect_map() helper, so it can do a small insn rewrite transparent to the prog itself in the sense that it fills R4 with a pointer to the own bpf_prog. We have that pointer at verification time anyway and R4 is allowed to be used as per calling convention we scratch R0 to R5 anyway, so they become inaccessible and program cannot read them prior to a write. Then, the helper would store the prog pointer in the current CPUs struct redirect_info. Later in xdp_do_*_redirect() we check whether the redirect_info's prog pointer is the same as passed xdp_prog pointer, and if that's the case then all good, since the prog holds a ref on the map anyway, so it is always valid at that point in time and must have a reference count of at least 1. If in the unlikely case they are not equal, it means we got a stale pointer, so we clear and bail out right there. Also do reset map and the owning prog in bpf_xdp_redirect(), so that bpf_xdp_redirect_map() and bpf_xdp_redirect() won't get mixed up, only the last call should take precedence. A tc bpf_redirect() doesn't use map anywhere yet, so no need to clear it there since never accessed in that layer. Note that in case the prog is released, and thus the map as well we're still under RCU read critical section at that time and have preemption disabled as well. Once we commit with the __dev_map_insert_ctx() from xdp_do_redirect_map() and set the map to ri->map_to_flush, we still wait for a xdp_do_flush_map() to finish in devmap dismantle time once flush_needed bit is set, so that is fine. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Reported-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d690c7dd1f1a..477b6932c3c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4203,6 +4203,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->imm == BPF_FUNC_redirect_map) { + u64 addr = (unsigned long)prog; + struct bpf_insn r4_ld[] = { + BPF_LD_IMM64(BPF_REG_4, addr), + *insn, + }; + cnt = ARRAY_SIZE(r4_ld); + + new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + } patch_call_imm: fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed -- cgit v1.2.3 From 5a67da2a71c64daeb456f6f3e87b5c7cecdc5ffa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:00:49 -0700 Subject: bpf: add support for sockmap detach programs The bpf map sockmap supports adding programs via attach commands. This patch adds the detach command to keep the API symmetric and allow users to remove previously added programs. Otherwise the user would have to delete the map and re-add it to get in this state. This also adds a series of additional tests to capture detach operation and also attaching/detaching invalid prog types. API note: socks will run (or not run) programs depending on the state of the map at the time the sock is added. We do not for example walk the map and remove programs from previously attached socks. Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 2 +- kernel/bpf/syscall.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f6ffde9c6a68..6424ce0e4969 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -792,7 +792,7 @@ out_progs: return err; } -int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70ad8e220343..cb17e1cd1d43 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1096,10 +1096,10 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr) +static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) { + struct bpf_prog *prog = NULL; int ufd = attr->target_fd; - struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1109,16 +1109,20 @@ static int sockmap_get_from_fd(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); + if (attach) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { + fdput(f); + return PTR_ERR(prog); + } } - err = sock_map_attach_prog(map, prog, attr->attach_type); + err = sock_map_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog); + if (prog) + bpf_prog_put(prog); return err; } @@ -1155,7 +1159,7 @@ static int bpf_prog_attach(const 
union bpf_attr *attr) break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr); + return sockmap_get_from_fd(attr, true); default: return -EINVAL; } @@ -1204,7 +1208,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; - + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + ret = sockmap_get_from_fd(attr, false); + break; default: return -EINVAL; } -- cgit v1.2.3 From 374fb014fc5b15e420faa00af036868a635eadd3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:01:10 -0700 Subject: bpf: devmap, use cond_resched instead of cpu_relax Be a bit more friendly about waiting for flush bits to complete. Replace the cpu_relax() with a cond_resched(). Suggested-by: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ecf9f99ecc57..959c9a07f318 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -159,7 +159,7 @@ static void dev_map_free(struct bpf_map *map) unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); while (!bitmap_empty(bitmap, dtab->map.max_entries)) - cpu_relax(); + cond_resched(); } for (i = 0; i < dtab->map.max_entries; i++) { -- cgit v1.2.3 From 46123355af729514e6fa8b8a9dd1e645e61a6466 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 10 Sep 2017 09:55:05 -0700 Subject: sched/fair: Fix nuisance kernel-doc warning Work around kernel-doc warning ('*' in Sphinx doc means "emphasis"): ../kernel/sched/fair.c:7584: WARNING: Inline emphasis start-string without end-string. Signed-off-by: Randy Dunlap Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f18b30f9-6251-6d86-9d44-16501e386891@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9dd2ce1e5ca2..8415d1ec2b84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7719,7 +7719,7 @@ next_group: * number. * * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in *imbalance. + * this CPU. The amount of the imbalance is returned in env->imbalance. * * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed -- cgit v1.2.3 From 609320c8a22715b74b39796930c3542719f8ab62 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 7 Sep 2017 18:36:15 -0700 Subject: perf/bpf: fix a clang compilation issue clang does not support variable length array for structure member. It has the following error during compilation: kernel/trace/trace_syscalls.c:568:17: error: fields must have a constant size: 'variable length array in structure' extension will never be supported unsigned long args[sys_data->nb_args]; ^ The fix is to use a fixed array length instead. Reported-by: Nick Desaulniers Signed-off-by: Yonghong Song Signed-off-by: David S. 
Miller --- kernel/trace/trace_syscalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9c4eef20301c..696afe72d3b1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -565,7 +565,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; - unsigned long args[sys_data->nb_args]; + unsigned long args[SYSCALL_DEFINE_MAXARGS]; } param; int i; -- cgit v1.2.3 From 2800486ee34825d954f64c6f98037daea328f121 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 17:03:50 +0200 Subject: sched/fair: Avoid newidle balance for !active CPUs On CPU hot unplug, when parking the last kthread we'll try and schedule into idle to kill the CPU. This last schedule can (and does) trigger newidle balance because at this point the sched domains are still up because of commit: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Obviously pulling tasks to an already offline CPU is a bad idea, and all balancing operations _should_ be subject to cpu_active_mask, make it so. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Link: http://lkml.kernel.org/r/20170907150613.994135806@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8415d1ec2b84..3bcea40d3029 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8447,6 +8447,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) */ this_rq->idle_stamp = rq_clock(this_rq); + /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding -- cgit v1.2.3 From edd8e41d2e3cbd6ebe13ead30eb1adc6f48cbb33 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 17:03:51 +0200 Subject: sched/fair: Plug hole between hotplug and active_load_balance() The load balancer applies cpu_active_mask to whatever sched_domains it finds, however in the case of active_balance there is a hole between setting rq->{active_balance,push_cpu} and running the stop_machine work doing the actual migration. The @push_cpu can go offline in this window, which would result in us moving a task onto a dead cpu, which is a fairly bad thing. Double check the active mask before the stop work does the migration. CPU0 CPU1 stop_machine(takedown_cpu) load_balance() cpu_stopper_thread() ... work = multi_cpu_stop stop_one_cpu_nowait( /* wait for CPU0 */ .func = active_load_balance_cpu_stop ); cpu_stopper_thread() work = multi_cpu_stop /* sync with CPU1 */ take_cpu_down() play_dead(); work = active_load_balance_cpu_stop set_task_cpu(p, CPU1); /* oops!! 
*/ Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170907150614.044460912@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bcea40d3029..efeebed935ae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8560,6 +8560,13 @@ static int active_load_balance_cpu_stop(void *data) struct rq_flags rf; rq_lock_irq(busiest_rq, &rf); + /* + * Between queueing the stop-work and running it is a hole in which + * CPUs can become inactive. We should not move tasks from or to + * inactive CPUs. + */ + if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) + goto out_unlock; /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || -- cgit v1.2.3 From 4ff9083b8a9a80bdf4ebbbec22cda4cbfb60f7aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 17:03:52 +0200 Subject: sched/core: WARN() when migrating to an offline CPU Migrating tasks to offline CPUs is a pretty big fail, warn about it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170907150614.094206976@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 136a76d80dbf..18a6966567da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1173,6 +1173,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif + /* + * Clearly, migrating tasks to offline CPUs is a fairly daft thing. + */ + WARN_ON_ONCE(!cpu_online(new_cpu)); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v1.2.3 From 9469eb01db891b55367ee7539f1b9f7f6fd2819d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 17:03:53 +0200 Subject: sched/debug: Add debugfs knob for "sched_debug" I'm forever late for editing my kernel cmdline, add a runtime knob to disable the "sched_debug" thing. 
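For illustration only (not part of the patch, and the function name below is made up; the in-tree consumer is sched_domain_debug() in kernel/sched/topology.c), code gated by the new knob simply tests the exported flag:

	static void dump_domains_sketch(int cpu)
	{
		/* Honour the runtime "sched_debug" debugfs toggle */
		if (!sched_debug_enabled)
			return;

		/* ... walk and print the sched_domain hierarchy for @cpu ... */
	}
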
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170907150614.142924283@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 5 +++++ kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 4 +--- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..b19d06ea6e10 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +__read_mostly bool sched_debug_enabled; + static __init int sched_init_debug(void) { debugfs_create_file("sched_features", 0644, NULL, NULL, &sched_feat_fops); + debugfs_create_bool("sched_debug", 0644, NULL, + &sched_debug_enabled); + return 0; } late_initcall(sched_init_debug); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ab1c7f5409a0..7ea2a0339771 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1954,6 +1954,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG +extern bool sched_debug_enabled; + extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..2ab2aa68c796 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -14,11 +14,9 @@ cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_debug_enabled; - static int __init sched_debug_setup(char *str) { - sched_debug_enabled = 1; + sched_debug_enabled = true; return 0; } -- cgit v1.2.3 From 0ee931c4e31a5efb134c76440405e9219f896e33 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 13 Sep 2017 16:28:29 -0700 Subject: mm: treewide: remove GFP_TEMPORARY allocation flag GFP_TEMPORARY was introduced by commit e12ba74d8ff3 ("Group short-lived and reclaimable kernel allocations") along with __GFP_RECLAIMABLE. It's primary motivation was to allow users to tell that an allocation is short lived and so the allocator can try to place such allocations close together and prevent long term fragmentation. As much as this sounds like a reasonable semantic it becomes much less clear when to use the highlevel GFP_TEMPORARY allocation flag. How long is temporary? Can the context holding that memory sleep? Can it take locks? It seems there is no good answer for those questions. The current implementation of GFP_TEMPORARY is basically GFP_KERNEL | __GFP_RECLAIMABLE which in itself is tricky because basically none of the existing caller provide a way to reclaim the allocated memory. So this is rather misleading and hard to evaluate for any benefits. I have checked some random users and none of them has added the flag with a specific justification. I suspect most of them just copied from other existing users and others just thought it might be a good idea to use without any measuring. This suggests that GFP_TEMPORARY just motivates for cargo cult usage without any reasoning. I believe that our gfp flags are quite complex already and especially those with highlevel semantic should be clearly defined to prevent from confusion and abuse. Therefore I propose dropping GFP_TEMPORARY and replace all existing users to simply use GFP_KERNEL. 
Please note that SLAB users with shrinkers will still get __GFP_RECLAIMABLE heuristic and so they will be placed properly for memory fragmentation prevention. I can see reasons we might want some gfp flag to reflect shorterm allocations but I propose starting from a clear semantic definition and only then add users with proper justification. This was been brought up before LSF this year by Matthew [1] and it turned out that GFP_TEMPORARY really doesn't have a clear semantic. It seems to be a heuristic without any measured advantage for most (if not all) its current users. The follow up discussion has revealed that opinions on what might be temporary allocation differ a lot between developers. So rather than trying to tweak existing users into a semantic which they haven't expected I propose to simply remove the flag and start from scratch if we really need a semantic for short term allocations. [1] http://lkml.kernel.org/r/20170118054945.GD18349@bombadil.infradead.org [akpm@linux-foundation.org: fix typo] [akpm@linux-foundation.org: coding-style fixes] [sfr@canb.auug.org.au: drm/i915: fix up] Link: http://lkml.kernel.org/r/20170816144703.378d4f4d@canb.auug.org.au Link: http://lkml.kernel.org/r/20170728091904.14627-1-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Stephen Rothwell Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Neil Brown Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/test-ww_mutex.c | 2 +- kernel/trace/trace_events_filter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 39f56c870051..0e4cd64ad2c0 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -362,7 +362,7 @@ static int *get_random_order(int count) int *order; int n, r, tmp; - order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) return order; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 181e139a8057..61e7f0678d33 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps, int pos = ps->lasterr_pos; char *buf, *pbuf; - buf = (char *)__get_free_page(GFP_TEMPORARY); + buf = (char *)__get_free_page(GFP_KERNEL); if (!buf) return; -- cgit v1.2.3 From d0b6e0a8ef24b1b07078ababe5d91bcdf4f4264a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Sep 2017 21:36:55 +0200 Subject: watchdog/hardlockup: Provide interface to stop/restart perf events Provide an interface to stop and restart perf NMI watchdog events on all CPUs. This is only usable during init and especially for handling the perf HT bug on Intel machines. It's safe to use it this way as nothing can start/stop the NMI watchdog in parallel. 
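As a rough sketch of the intended usage (not taken from this patch; the caller name is hypothetical), an early-init user such as the x86 perf HT workaround would bracket its PMU fixup with the new pair while holding the CPU hotplug lock, since both helpers assert lockdep_assert_cpus_held():

	static __init void fixup_ht_bug_sketch(void)
	{
		cpus_read_lock();
		hardlockup_detector_perf_stop();	/* disable the per-CPU NMI watchdog events */

		/* ... reconfigure the PMU / event constraints here ... */

		hardlockup_detector_perf_restart();	/* re-enable them if the NMI watchdog is active */
		cpus_read_unlock();
	}
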
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.167649596@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog_hld.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3a09ea1b1d3d..c9586ebc2e98 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -261,3 +261,44 @@ void watchdog_nmi_disable(unsigned int cpu) firstcpu_err = 0; } } + +/** + * hardlockup_detector_perf_stop - Globally stop watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_stop(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_disable(event); + } +} + +/** + * hardlockup_detector_perf_restart - Globally restart watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_restart(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_enable(event); + } +} -- cgit v1.2.3 From 6554fd8cf06db86f861bb24d7487b2873ca444c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:36:57 +0200 Subject: watchdog/core: Provide interface to stop from poweroff() PARISC has a a busy looping power off routine. If the watchdog is enabled the watchdog timer will still fire, but the thread is not running, which causes the softlockup watchdog to trigger. Provide a interface which allows to turn the watchdog off. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Helge Deller Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/20170912194146.327343752@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f5d52024f6b7..f23e373aa3bf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -333,7 +333,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled || + atomic_read(&watchdog_park_in_progress) != 0) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -660,6 +661,17 @@ static void set_sample_period(void) } #endif /* SOFTLOCKUP */ +/** + * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) + * + * Special interface for parisc. It prevents lockup detector warnings from + * the default pm_poweroff() function which busy loops forever. + */ +void lockup_detector_soft_poweroff(void) +{ + watchdog_enabled = 0; +} + /* * Suspend the hard and soft lockup detector by parking the watchdog threads. 
*/ -- cgit v1.2.3 From 5490125d77a43016b26f629d4b485e2c62172551 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:36:59 +0200 Subject: watchdog/core: Remove broken suspend/resume interfaces This interface has several issues: - It's causing recursive locking of the hotplug lock. - It's complete overkill to teardown all threads and then recreate them The same can be achieved with the simple hardlockup_detector_perf_stop / restart() interfaces. The abuse from the busy looping poweroff() loop of PARISC has been solved as well. Remove the cruft. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.487537732@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 89 +------------------------------------------------------ 1 file changed, 1 insertion(+), 88 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f23e373aa3bf..b2d46757917e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -97,19 +97,6 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); * unregistered/stopped, so it is an indicator whether the threads exist. */ static int __read_mostly watchdog_running; -/* - * If a subsystem has a need to deactivate the watchdog temporarily, it - * can use the suspend/resume interface to achieve this. The content of - * the 'watchdog_suspended' variable reflects this state. Existing threads - * are parked/unparked by the lockup_detector_{suspend|resume} functions - * (see comment blocks pertaining to those functions for further details). - * - * 'watchdog_suspended' also prevents threads from being registered/started - * or unregistered/stopped via parameters in /proc/sys/kernel, so the state - * of 'watchdog_running' cannot change while the watchdog is deactivated - * temporarily (see related code in 'proc' handlers). - */ -int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its @@ -136,7 +123,6 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * - watchdog_cpumask * - sysctl_hardlockup_all_cpu_backtrace * - hardlockup_panic - * - watchdog_suspended */ void __weak watchdog_nmi_reconfigure(void) { @@ -672,61 +658,6 @@ void lockup_detector_soft_poweroff(void) watchdog_enabled = 0; } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. 
- */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - #ifdef CONFIG_SYSCTL /* @@ -775,12 +706,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -872,12 +797,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -917,12 +836,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, get_online_cpus(); mutex_lock(&watchdog_proc_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -941,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, watchdog_nmi_reconfigure(); } -out: + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); return err; -- cgit v1.2.3 From b7a349819d4b9b5db64e523351e66a79a758eaa5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:00 +0200 Subject: watchdog/core: Rework CPU hotplug locking The watchdog proc interface causes extensive recursive locking of the CPU hotplug percpu rwsem, which is deadlock prone. Replace the get/put_online_cpus() pairs with cpu_hotplug_disable()/enable() calls for now. Later patches will remove that requirement completely. 
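As a rough aid, the difference between the two primitives can be sketched as follows; the bodies are a simplified, from-memory condensation of kernel/cpu.c of that era and are not part of this patch:

  /* Condensed sketch, error handling and kerneldoc elided. */
  void get_online_cpus(void)
  {
          /* read side of the percpu rwsem: nesting it inside a path that
           * already holds cpu_hotplug_lock is what makes the proc handlers
           * deadlock prone */
          percpu_down_read(&cpu_hotplug_lock);
  }

  void cpu_hotplug_disable(void)
  {
          /* only bumps a counter under cpu_add_remove_lock, so it can be
           * used here without recursing on the hotplug rwsem */
          cpu_maps_update_begin();
          cpu_hotplug_disabled++;
          cpu_maps_update_done();
  }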
Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.568079057@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b2d46757917e..cd79f644ea34 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -703,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int err, old, new; int *watchdog_param = (int *)table->data; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); /* @@ -752,7 +752,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } @@ -794,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old, new; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); old = ACCESS_ONCE(watchdog_thresh); @@ -818,7 +818,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } @@ -833,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - get_online_cpus(); + cpu_hotplug_disable(); mutex_lock(&watchdog_proc_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); @@ -856,7 +856,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + cpu_hotplug_enable(); return err; } -- cgit v1.2.3 From 946d197794b23202b8b46c43016747c72fe23393 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:01 +0200 Subject: watchdog/core: Rename watchdog_proc_mutex Following patches will use the mutex for other purposes as well. Rename it as it is not longer a proc specific thing. 
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.647714850@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cd79f644ea34..7c3a0a76b41b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,8 +29,7 @@ #include #include -/* Watchdog configuration */ -static DEFINE_MUTEX(watchdog_proc_mutex); +static DEFINE_MUTEX(watchdog_mutex); int __read_mostly nmi_watchdog_enabled; @@ -704,7 +703,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int *watchdog_param = (int *)table->data; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); /* * If the parameter is being read return the state of the corresponding @@ -751,7 +750,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, err = proc_watchdog_update(); } out: - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } @@ -795,7 +794,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, int err, old, new; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -817,7 +816,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, set_sample_period(); } out: - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } @@ -834,7 +833,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, int err; cpu_hotplug_disable(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { @@ -855,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, watchdog_nmi_reconfigure(); } - mutex_unlock(&watchdog_proc_mutex); + mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } -- cgit v1.2.3 From 7a3558200739e1378800a7a6d7f63c031115f7a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:02 +0200 Subject: watchdog/core: Mark hardlockup_detector_disable() __init The function is only used by the KVM init code. Mark it __init to prevent creative abuse. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.727134632@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7c3a0a76b41b..1c185d9dd468 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -55,7 +55,7 @@ unsigned int __read_mostly hardlockup_panic = * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). 
*/ -void hardlockup_detector_disable(void) +void __init hardlockup_detector_disable(void) { watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; } -- cgit v1.2.3 From 20d853fd0703b1d73c35a22024c0d4fcbcc57c8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:03 +0200 Subject: watchdog/hardlockup/perf: Remove broken self disable on failure The self disabling feature is broken vs. CPU hotplug locking: CPU 0 CPU 1 cpus_write_lock(); cpu_up(1) wait_for_completion() .... unpark_watchdog() ->unpark() perf_event_create() <- fails watchdog_enable &= ~NMI_WATCHDOG; .... cpus_write_unlock(); CPU 2 cpus_write_lock() cpu_down(2) wait_for_completion() wakeup(watchdog); watchdog() if (!(watchdog_enable & NMI_WATCHDOG)) watchdog_nmi_disable() perf_event_disable() .... cpus_read_lock(); stop_smpboot_threads() park_watchdog(); wait_for_completion(watchdog->parked); Result: End of hotplug and instantaneous full lockup of the machine. There is a similar problem with disabling the watchdog via the user space interface as the sysctl function fiddles with watchdog_enable directly. It's very debatable whether this is required at all. If the watchdog works nicely on N CPUs and it fails to enable on the N + 1 CPU either during hotplug or because the user space interface disabled it via sysctl cpumask and then some perf user grabbed the counter which is then unavailable for the watchdog when the sysctl cpumask gets changed back. There is no real justification for this. One of the reasons WHY this is done is the utter stupidity of the init code of the perf NMI watchdog. Instead of checking upfront at boot whether PERF is available and functional at all, it just does this check at run time over and over when user space fiddles with the sysctl. That's broken beyond repair along with the idiotic error code dependent warn level printks and the even more silly printk rate limiting. If the init code checks whether perf works at boot time, then this mess can be more or less avoided completely. Perf does not come magically into life at runtime. Brain usage while coding is overrated. Remove the cruft and add a temporary safe guard which gets removed later. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.806708429@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 15 --------------- kernel/watchdog_hld.c | 20 +++++++------------- 2 files changed, 7 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1c185d9dd468..af000956286c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -485,21 +485,6 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); - - /* - * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the - * failure path. Check for failures that can occur asynchronously - - * for example, when CPUs are on-lined - and shut down the hardware - * perf event on each CPU accordingly. - * - * The only non-obvious place this bit can be cleared is through - * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a - * pr_info here would be too noisy as it would result in a message - * every few seconds if the hardlockup was disabled but the softlockup - * enabled. 
- */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - watchdog_nmi_disable(cpu); } static struct smp_hotplug_thread watchdog_threads = { diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index c9586ebc2e98..7b602714ea53 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -23,6 +23,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); static unsigned long hardlockup_allcpu_dumped; +static bool hardlockup_detector_disabled; void arch_touch_nmi_watchdog(void) { @@ -178,6 +179,10 @@ int watchdog_nmi_enable(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) goto out; + /* A failure disabled the hardlockup detector permanently */ + if (hardlockup_detector_disabled) + return -ENODEV; + /* is it already setup and enabled? */ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -206,18 +211,6 @@ int watchdog_nmi_enable(unsigned int cpu) goto out_save; } - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - /* skip displaying the same error again */ if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); @@ -232,7 +225,8 @@ int watchdog_nmi_enable(unsigned int cpu) pr_err("disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); - pr_info("Shutting down hard lockup detector on all cpus\n"); + pr_info("Disabling hard lockup detector permanently\n"); + hardlockup_detector_disabled = true; return PTR_ERR(event); -- cgit v1.2.3 From 941154bd6937a710ae9193a3c733c0029e5ae7b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:04 +0200 Subject: watchdog/hardlockup/perf: Prevent CPU hotplug deadlock The following deadlock is possible in the watchdog hotplug code: cpus_write_lock() ... takedown_cpu() smpboot_park_threads() smpboot_park_thread() kthread_park() ->park() := watchdog_disable() watchdog_nmi_disable() perf_event_release_kernel(); put_event() _free_event() ->destroy() := hw_perf_event_destroy() x86_release_hardware() release_ds_buffers() get_online_cpus() when a per cpu watchdog perf event is destroyed which drops the last reference to the PMU hardware. The cleanup code there invokes get_online_cpus() which instantly deadlocks because the hotplug percpu rwsem is write locked. To solve this add a deferring mechanism: cpus_write_lock() kthread_park() watchdog_nmi_disable(deferred) perf_event_disable(event); move_event_to_deferred(event); .... cpus_write_unlock() cleaup_deferred_events() perf_event_release_kernel() This is still properly serialized against concurrent hotplug via the cpu_add_remove_lock, which is held by the task which initiated the hotplug event. This is also used to handle event destruction when the watchdog threads are parked via other mechanisms than CPU hotplug. 
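Condensed to its core, the deferral works as in the sketch below; the helper names are made up for illustration, the real implementation is in the hunks that follow:

  /* Sketch of the deferred-release pattern (helper names are illustrative). */
  static DEFINE_PER_CPU(struct perf_event *, dead_event);
  static struct cpumask dead_events_mask;

  static void watchdog_defer_event_release(struct perf_event *event)
  {
          /* hotplug lock is write held here: only disable and park the event */
          perf_event_disable(event);
          this_cpu_write(dead_event, event);
          cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
  }

  static void watchdog_release_deferred_events(void)
  {
          int cpu;

          /* runs after cpus_write_unlock(), so taking the hotplug lock in
           * perf_event_release_kernel() is safe again */
          for_each_cpu(cpu, &dead_events_mask)
                  perf_event_release_kernel(per_cpu(dead_event, cpu));
          cpumask_clear(&dead_events_mask);
  }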
Analyzed-by: Peter Zijlstra Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.884469246@linutronix.de Signed-off-by: Ingo Molnar --- kernel/cpu.c | 6 ++++++ kernel/watchdog.c | 25 +++++++++++++++++++++++++ kernel/watchdog_hld.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 59 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..a96b348591df 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -734,6 +735,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); + /* + * Do post unplug cleanup. This is still protected against + * concurrent CPU hotplug via cpu_add_remove_lock. + */ + lockup_detector_cleanup(); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index af000956286c..dd1fd59683c5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -109,8 +109,10 @@ int __weak watchdog_nmi_enable(unsigned int cpu) { return 0; } + void __weak watchdog_nmi_disable(unsigned int cpu) { + hardlockup_detector_perf_disable(); } /* @@ -193,6 +195,8 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif #endif +static void __lockup_detector_cleanup(void); + /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally @@ -631,6 +635,24 @@ static void set_sample_period(void) } #endif /* SOFTLOCKUP */ +static void __lockup_detector_cleanup(void) +{ + lockdep_assert_held(&watchdog_mutex); + hardlockup_detector_perf_cleanup(); +} + +/** + * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes + * + * Caller must not hold the cpu hotplug rwsem. 
+ */ +void lockup_detector_cleanup(void) +{ + mutex_lock(&watchdog_mutex); + __lockup_detector_cleanup(); + mutex_unlock(&watchdog_mutex); +} + /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * @@ -665,6 +687,8 @@ static int proc_watchdog_update(void) watchdog_nmi_reconfigure(); + __lockup_detector_cleanup(); + return err; } @@ -837,6 +861,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } watchdog_nmi_reconfigure(); + __lockup_detector_cleanup(); } mutex_unlock(&watchdog_mutex); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 7b602714ea53..94111ccb09b5 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,6 +21,8 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); +static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; static bool hardlockup_detector_disabled; @@ -239,16 +241,18 @@ out: return 0; } -void watchdog_nmi_disable(unsigned int cpu) +/** + * hardlockup_detector_perf_disable - Disable the local event + */ +void hardlockup_detector_perf_disable(void) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = this_cpu_read(watchdog_ev); if (event) { perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); + cpumask_set_cpu(smp_processor_id(), &dead_events_mask); /* watchdog_nmi_enable() expects this to be zero initially. */ if (atomic_dec_and_test(&watchdog_cpus)) @@ -256,6 +260,24 @@ void watchdog_nmi_disable(unsigned int cpu) } } +/** + * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them + * + * Called from lockup_detector_cleanup(). Serialized by the caller. + */ +void hardlockup_detector_perf_cleanup(void) +{ + int cpu; + + for_each_cpu(cpu, &dead_events_mask) { + struct perf_event *event = per_cpu(dead_event, cpu); + + per_cpu(dead_event, cpu) = NULL; + perf_event_release_kernel(event); + } + cpumask_clear(&dead_events_mask); +} + /** * hardlockup_detector_perf_stop - Globally stop watchdog events * -- cgit v1.2.3 From 01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:05 +0200 Subject: watchdog/core: Remove the park_in_progress obfuscation Commit: b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded system") tries to fix the following issue: proc_write() set_sample_period() <--- New sample period becoms visible <----- Broken starts proc_watchdog_update() watchdog_enable_all_cpus() watchdog_hrtimer_fn() update_watchdog_all_cpus() restart_timer(sample_period) watchdog_park_threads() thread->park() disable_nmi() <----- Broken ends The reason why this is broken is that the update of the watchdog threshold becomes immediately effective and visible for the hrtimer function which uses that value to rearm the timer. But the NMI/perf side still uses the old value up to the point where it is disabled. If the rate has been lowered then the NMI can run fast enough to 'detect' a hard lockup because the timer has not fired due to the longer period. 
The patch 'fixed' this by adding a variable: proc_write() set_sample_period() <----- Broken starts proc_watchdog_update() watchdog_enable_all_cpus() watchdog_hrtimer_fn() update_watchdog_all_cpus() restart_timer(sample_period) watchdog_park_threads() park_in_progress = 1 <----- Broken ends nmi_watchdog() if (park_in_progress) return; The only effect of this variable was to make the window where the breakage can hit small enough that it was not longer observable in testing. From a correctness point of view it is a pointless bandaid which merily papers over the root cause: the unsychronized update of the variable. Looking deeper into the related code pathes unearthed similar problems in the watchdog_start()/stop() functions. watchdog_start() perf_nmi_event_start() hrtimer_start() watchdog_stop() hrtimer_cancel() perf_nmi_event_stop() In both cases the call order is wrong because if the tasks gets preempted or the VM gets scheduled out long enough after the first call, then there is a chance that the next NMI will see a stale hrtimer interrupt count and trigger a false positive hard lockup splat. Get rid of park_in_progress so the code can be gradually deobfuscated and pruned from several layers of duct tape papering over the root cause, which has been either ignored or not understood at all. Once this is removed the underlying problem will be fixed by rewriting the proc interface to do a proper synchronized update. Address the start/stop() ordering problem as well by reverting the call order, so this part is at least correct now. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 37 +++++++++++++++++-------------------- kernel/watchdog_hld.c | 7 ++----- 2 files changed, 19 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dd1fd59683c5..c290135fb415 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -136,8 +136,6 @@ void __weak watchdog_nmi_reconfigure(void) #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -322,8 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (!watchdog_enabled || - atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -437,32 +434,37 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - /* kick off the timer for the hardlockup detector */ + /* + * Start the timer first to prevent the NMI watchdog triggering + * before the timer has a chance to fire. 
+ */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); + /* Initialize timestamp */ + __touch_watchdog(); /* Enable the perf event */ watchdog_nmi_enable(cpu); - /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); - - /* initialize timestamp */ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); - __touch_watchdog(); } static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); - hrtimer_cancel(hrtimer); - /* disable the perf event */ + /* + * Disable the perf event first. That prevents that a large delay + * between disabling the timer and disabling the perf event causes + * the perf NMI to detect a false positive. + */ watchdog_nmi_disable(cpu); + hrtimer_cancel(hrtimer); } static void watchdog_cleanup(unsigned int cpu, bool online) @@ -518,16 +520,11 @@ static int watchdog_park_threads(void) { int cpu, ret = 0; - atomic_set(&watchdog_park_in_progress, 1); - for_each_watchdog_cpu(cpu) { ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); if (ret) break; } - - atomic_set(&watchdog_park_in_progress, 0); - return ret; } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 94111ccb09b5..0aa191ee3d51 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr = { /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; - if (atomic_read(&watchdog_park_in_progress) != 0) - return; - if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; -- cgit v1.2.3 From 2b9d7f233b835663cbc7b6b3f88dd20f61118d1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:06 +0200 Subject: watchdog/core: Clean up stub functions Having stub functions which take a full page is not helping the readablility of code. Condense them and move the doubled #ifdef variant into the SYSFS section. 
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.045545271@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 68 ++++++++++++++++++------------------------------------- 1 file changed, 22 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c290135fb415..af37c040436c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -125,10 +125,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * - sysctl_hardlockup_all_cpu_backtrace * - hardlockup_panic */ -void __weak watchdog_nmi_reconfigure(void) -{ -} - +void __weak watchdog_nmi_reconfigure(void) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -136,6 +133,11 @@ void __weak watchdog_nmi_reconfigure(void) #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +/* Global variables, exported for sysctl */ +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +int __read_mostly soft_watchdog_enabled; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -149,13 +151,9 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; } __setup("softlockup_panic=", softlockup_panic_setup); @@ -593,44 +591,13 @@ static void watchdog_disable_all_cpus(void) } } -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask); -} -#endif - -#else /* SOFTLOCKUP */ -static int watchdog_park_threads(void) -{ - return 0; -} - -static void watchdog_unpark_threads(void) -{ -} - -static int watchdog_enable_all_cpus(void) -{ - return 0; -} - -static void watchdog_disable_all_cpus(void) -{ -} - -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return 0; -} -#endif - -static void set_sample_period(void) -{ -} -#endif /* SOFTLOCKUP */ +#else /* CONFIG_SOFTLOCKUP_DETECTOR */ +static inline int watchdog_park_threads(void) { return 0; } +static inline void watchdog_unpark_threads(void) { } +static inline int watchdog_enable_all_cpus(void) { return 0; } +static inline void watchdog_disable_all_cpus(void) { } +static inline void set_sample_period(void) { } +#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) { @@ -827,6 +794,15 @@ out: return err; } +static int watchdog_update_cpus(void) +{ + if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { + return smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_cpumask); + } + return 0; +} + /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the -- cgit v1.2.3 From 368a7e2ce8ff0ddcdcb37eadb76530b033f6eb2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:07 +0200 Subject: watchdog/core: Clean up the #ifdef maze The #ifdef maze in this file is horrible, group stuff at least a bit so one can figure out what belongs to what. 
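The grouping the patch aims for can be sketched as below (shape only, not the full file; the real hunks follow):

  /* Sketch of the resulting #ifdef grouping. */
  #ifdef CONFIG_HARDLOCKUP_DETECTOR
  /* hardlockup_panic and the nmi_watchdog= early param live here */
  # ifdef CONFIG_SMP
  int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
  /* ... together with its __setup("hardlockup_all_cpu_backtrace=", ...) handler */
  # endif /* CONFIG_SMP */
  #endif /* CONFIG_HARDLOCKUP_DETECTOR */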
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.139629546@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index af37c040436c..a9bdfde73e4b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -41,7 +41,6 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* boot commands */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -74,19 +73,21 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -#endif +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#ifdef CONFIG_SOFTLOCKUP_DETECTOR -int __read_mostly soft_watchdog_enabled; -#endif +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); +# endif /* CONFIG_SMP */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#endif struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -173,22 +174,14 @@ static int __init nosoftlockup_setup(char *str) __setup("nosoftlockup", nosoftlockup_setup); #ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; + static int __init softlockup_all_cpu_backtrace_setup(char *str) { - sysctl_softlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); + sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -#endif #endif static void __lockup_detector_cleanup(void); -- cgit v1.2.3 From 05ba3de74a3f499dcaa37b186220aaf174c95a4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:08 +0200 Subject: watchdog/core: Split out cpumask write function Split the write part of the cpumask proc handler out into a separate helper to avoid deep indentation. This also reduces the patch complexity in the following cleanups. 
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.218075991@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a9bdfde73e4b..cedf45ab4d81 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -792,10 +792,29 @@ static int watchdog_update_cpus(void) if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { return smpboot_update_cpumask_percpu_thread(&watchdog_threads, &watchdog_cpumask); + __lockup_detector_cleanup(); } return 0; } +static void proc_watchdog_cpumask_update(void) +{ + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); + + if (watchdog_running) { + /* + * Failure would be due to being unable to allocate a + * temporary cpumask, so we are likely not in a position to + * do much else to make things better. + */ + if (watchdog_update_cpus() != 0) + pr_err("cpumask update failed\n"); + } + + watchdog_nmi_reconfigure(); +} + /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the @@ -811,30 +830,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); - if (!err && write) { - /* Remove impossible cpus to keep sysctl output cleaner. */ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, - cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate - * a temporary cpumask, so we are likely not in a - * position to do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } - - watchdog_nmi_reconfigure(); - __lockup_detector_cleanup(); - } + if (!err && write) + proc_watchdog_cpumask_update(); mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } - #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) -- cgit v1.2.3 From 0d85923c7a81719567311ba0eae8ecb2efd4c8a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:09 +0200 Subject: smpboot/threads, watchdog/core: Avoid runtime allocation smpboot_update_cpumask_percpu_thread() allocates a temporary cpumask at runtime. This is suboptimal because the call site needs more code for proper error handling than a statically allocated temporary mask costs in data size. Add a static temporary cpumask. The function is globally serialized, so no further protection is required. Remove the half-baked error handling in the watchdog code and get rid of the export as there are no in-tree modular users of that function.
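The space/code trade-off can be illustrated as follows; the NR_CPUS figure is only an example for CONFIG_CPUMASK_OFFSTACK kernels and the helper name is made up:

  /* Illustration of the trade-off, not part of the patch. */

  /* After: one static mask, 1 KiB of .bss with NR_CPUS=8192 and no failure
   * path; safe because the update function is serialized by
   * smpboot_threads_lock anyway. */
  static struct cpumask tmp;

  /* Before: every update allocated a mask and had to handle failure. */
  static int update_mask_dynamically(void)
  {
          cpumask_var_t tmp_mask;

          if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
                  return -ENOMEM;         /* the error handling being removed */
          /* ... compute with tmp_mask ... */
          free_cpumask_var(tmp_mask);
          return 0;
  }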
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.297288838@linutronix.de Signed-off-by: Ingo Molnar --- kernel/smpboot.c | 22 +++++++--------------- kernel/watchdog.c | 21 +++++---------------- 2 files changed, 12 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1d71c051a951..ed7507b69b48 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -344,39 +344,31 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); * by the client, but only by calling this function. * This function can only be called on a registered smp_hotplug_thread. */ -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) { struct cpumask *old = plug_thread->cpumask; - cpumask_var_t tmp; + static struct cpumask tmp; unsigned int cpu; - if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) - return -ENOMEM; - get_online_cpus(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(tmp, old, new); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, old, new); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_park_thread(plug_thread, cpu); /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(tmp, new, old); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, new, old); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_unpark_thread(plug_thread, cpu); cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); - - free_cpumask_var(tmp); - - return 0; } -EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cedf45ab4d81..8935a3a4c2fb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -787,31 +787,20 @@ out: return err; } -static int watchdog_update_cpus(void) +static void watchdog_update_cpus(void) { - if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) { - return smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_cpumask); + if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) { + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_cpumask); __lockup_detector_cleanup(); } - return 0; } static void proc_watchdog_cpumask_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate a - * temporary cpumask, so we are likely not in a position to - * do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } - + watchdog_update_cpus(); watchdog_nmi_reconfigure(); } -- cgit v1.2.3 From 2eb2527f847d1bd8d8fb9db1e8139db5d6eddb36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:10 +0200 Subject: watchdog/core: Create new thread handling infrastructure The lockup detector reconfiguration tears down all watchdog threads when the watchdog is disabled and sets them up again when its enabled. That's a pointless exercise. 
The watchdog threads are not consuming an insane amount of resources, so it's enough to set them up at init time and keep them in parked position when the watchdog is disabled and unpark them when it is reenabled. The smpboot thread infrastructure takes care of keeping the force parked threads in place even across cpu hotplug. Another horrible mechanism are the open coded park/unpark loops which are used for reconfiguration of the watchdog. The smpboot infrastructure allows exactly the same via smpboot_update_cpumask_thread_percpu(), which is cpu hotplug safe. Using that instead of the open coded loops allows to get rid of the hotplug locking mess in the watchdog code. Implement a clean infrastructure which allows to replace the open coded nonsense. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.377182587@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8935a3a4c2fb..b35518375fb7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -139,6 +139,9 @@ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; int __read_mostly soft_watchdog_enabled; +struct cpumask watchdog_allowed_mask __read_mostly; +static bool softlockup_threads_initialized __read_mostly; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -584,12 +587,84 @@ static void watchdog_disable_all_cpus(void) } } +static void softlockup_update_smpboot_threads(void) +{ + lockdep_assert_held(&watchdog_mutex); + + if (!softlockup_threads_initialized) + return; + + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_allowed_mask); + __lockup_detector_cleanup(); +} + +/* Temporarily park all watchdog threads */ +static void softlockup_park_all_threads(void) +{ + cpumask_clear(&watchdog_allowed_mask); + softlockup_update_smpboot_threads(); +} + +/* + * Park threads which are not longer enabled and unpark threads which have + * been newly enabled. + */ +static void softlockup_update_threads(void) +{ + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + softlockup_update_smpboot_threads(); +} + +static void softlockup_reconfigure_threads(bool enabled) +{ + softlockup_park_all_threads(); + set_sample_period(); + if (enabled) + softlockup_update_threads(); +} + +/* + * Create the watchdog thread infrastructure. + * + * The threads are not unparked as watchdog_allowed_mask is empty. When + * the threads are sucessfully initialized, take the proper locks and + * unpark the threads in the watchdog_cpumask if the watchdog is enabled. + */ +static __init void softlockup_init_threads(void) +{ + int ret; + + /* + * If sysctl is off and watchdog got disabled on the command line, + * nothing to do here. 
+ */ + if (!IS_ENABLED(CONFIG_SYSCTL) && + !(watchdog_enabled && watchdog_thresh)) + return; + + ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_allowed_mask); + if (ret) { + pr_err("Failed to initialize soft lockup detector threads\n"); + return; + } + + mutex_lock(&watchdog_mutex); + softlockup_threads_initialized = true; + softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + mutex_unlock(&watchdog_mutex); +} + #else /* CONFIG_SOFTLOCKUP_DETECTOR */ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void set_sample_period(void) { } +static inline void softlockup_init_threads(void) { } +static inline void softlockup_update_threads(void) { } +static inline void softlockup_reconfigure_threads(bool enabled) { } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) -- cgit v1.2.3 From d57108d4f6791291e89d980e7f7a3566c32ab188 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:11 +0200 Subject: watchdog/core: Get rid of the thread teardown/setup dance The lockup detector reconfiguration tears down all watchdog threads when the watchdog is disabled and sets them up again when its enabled. That's a pointless exercise. The watchdog threads are not consuming an insane amount of resources, so it's enough to set them up at init time and keep them in parked position when the watchdog is disabled and unpark them when it is reenabled. The smpboot thread infrastructure takes care of keeping the force parked threads in place even across cpu hotplug. Aside of that the code implements the park/unpark facility of smp hotplug threads on its own, which is even more pointless. We have functionality in the smpboot thread code to do so. Use the new thread management functions and get rid of the unholy mess. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.470370113@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 190 ++++++------------------------------------------------ 1 file changed, 19 insertions(+), 171 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b35518375fb7..762d3ed82a08 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -91,13 +91,6 @@ int __read_mostly watchdog_thresh = 10; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* - * The 'watchdog_running' variable is set to 1 when the watchdog threads - * are registered/started and is set to 0 when the watchdog threads are - * unregistered/stopped, so it is an indicator whether the threads exist. - */ -static int __read_mostly watchdog_running; - /* * These functions can be overridden if an architecture implements its * own hardlockup detector. @@ -130,10 +123,6 @@ void __weak watchdog_nmi_reconfigure(void) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR -/* Helper for online, unparked cpus. 
*/ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -259,11 +248,15 @@ void touch_all_softlockup_watchdogs(void) int cpu; /* - * this is done lockless - * do we care if a 0 races with a timestamp? - * all it means is the softlock check starts one cycle later + * watchdog_mutex cannpt be taken here, as this might be called + * from (soft)interrupt context, so the access to + * watchdog_allowed_cpumask might race with a concurrent update. + * + * The watchdog time stamp can race against a concurrent real + * update as well, the only side effect might be a cycle delay for + * the softlockup check. */ - for_each_watchdog_cpu(cpu) + for_each_cpu(cpu, &watchdog_allowed_mask) per_cpu(watchdog_touch_ts, cpu) = 0; wq_watchdog_touch(-1); } @@ -303,9 +296,6 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_enable_all_cpus(void); -static void watchdog_disable_all_cpus(void); - /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -498,95 +488,6 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -/* - * park all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function returns an error if kthread_park() of a watchdog thread - * fails. In this situation, the watchdog threads of some CPUs can already - * be parked and the watchdog threads of other CPUs can still be runnable. - * Callers are expected to handle this special condition as appropriate in - * their context. - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static int watchdog_park_threads(void) -{ - int cpu, ret = 0; - - for_each_watchdog_cpu(cpu) { - ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); - if (ret) - break; - } - return ret; -} - -/* - * unpark all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static void watchdog_unpark_threads(void) -{ - int cpu; - - for_each_watchdog_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); -} - -static int update_watchdog_all_cpus(void) -{ - int ret; - - ret = watchdog_park_threads(); - if (ret) - return ret; - - watchdog_unpark_threads(); - - return 0; -} - -static int watchdog_enable_all_cpus(void) -{ - int err = 0; - - if (!watchdog_running) { - err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_cpumask); - if (err) - pr_err("Failed to create watchdog threads, disabled\n"); - else - watchdog_running = 1; - } else { - /* - * Enable/disable the lockup detectors or - * change the sample period 'on the fly'. 
- */ - err = update_watchdog_all_cpus(); - - if (err) { - watchdog_disable_all_cpus(); - pr_err("Failed to update lockup detectors, disabled\n"); - } - } - - if (err) - watchdog_enabled = 0; - - return err; -} - -static void watchdog_disable_all_cpus(void) -{ - if (watchdog_running) { - watchdog_running = 0; - smpboot_unregister_percpu_thread(&watchdog_threads); - } -} - static void softlockup_update_smpboot_threads(void) { lockdep_assert_held(&watchdog_mutex); @@ -661,7 +562,6 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static inline void set_sample_period(void) { } static inline void softlockup_init_threads(void) { } static inline void softlockup_update_threads(void) { } static inline void softlockup_reconfigure_threads(bool enabled) { } @@ -701,28 +601,10 @@ void lockup_detector_soft_poweroff(void) /* * Update the run state of the lockup detectors. */ -static int proc_watchdog_update(void) +static void proc_watchdog_update(void) { - int err = 0; - - /* - * Watchdog threads won't be started if they are already active. - * The 'watchdog_running' variable in watchdog_*_all_cpus() takes - * care of this. If those threads are already active, the sample - * period will be updated and the lockup detectors will be enabled - * or disabled 'on the fly'. - */ - if (watchdog_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - + softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); watchdog_nmi_reconfigure(); - - __lockup_detector_cleanup(); - - return err; - } /* @@ -778,17 +660,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, new = old & ~which; } while (cmpxchg(&watchdog_enabled, old, new) != old); - /* - * Update the run state of the lockup detectors. There is _no_ - * need to check the value returned by proc_watchdog_update() - * and to restore the previous value of 'watchdog_enabled' as - * both lockup detectors are disabled if proc_watchdog_update() - * returns an error. - */ - if (old == new) - goto out; - - err = proc_watchdog_update(); + if (old != new) + proc_watchdog_update(); } out: mutex_unlock(&watchdog_mutex); @@ -832,50 +705,28 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; + int err, old; cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); - old = ACCESS_ONCE(watchdog_thresh); + old = READ_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; + if (!err && write && old != READ_ONCE(watchdog_thresh)) + proc_watchdog_update(); - /* - * Update the sample period. Restore on failure. - */ - new = ACCESS_ONCE(watchdog_thresh); - if (old == new) - goto out; - - set_sample_period(); - err = proc_watchdog_update(); - if (err) { - watchdog_thresh = old; - set_sample_period(); - } -out: mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; } -static void watchdog_update_cpus(void) -{ - if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) { - smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_cpumask); - __lockup_detector_cleanup(); - } -} - static void proc_watchdog_cpumask_update(void) { /* Remove impossible cpus to keep sysctl output clean. 
*/ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - watchdog_update_cpus(); + + softlockup_update_threads(); watchdog_nmi_reconfigure(); } @@ -905,8 +756,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, void __init lockup_detector_init(void) { - set_sample_period(); - #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { pr_info("Disabling watchdog on nohz_full cores by default\n"); @@ -917,6 +766,5 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif - if (watchdog_enabled) - watchdog_enable_all_cpus(); + softlockup_init_threads(); } -- cgit v1.2.3 From e8b62b2dd14f8f2427856ba24cb7db922bda9bfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:12 +0200 Subject: watchdog/core: Further simplify sysctl handling Use a single function to update sysctl changes. This is not a high frequency user space interface and it's root only. Preparatory patch to cleanup the sysctl variable handling. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.549114957@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 762d3ed82a08..ca8747221e87 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -507,11 +507,8 @@ static void softlockup_park_all_threads(void) softlockup_update_smpboot_threads(); } -/* - * Park threads which are not longer enabled and unpark threads which have - * been newly enabled. - */ -static void softlockup_update_threads(void) +/* Unpark enabled threads */ +static void softlockup_unpark_threads(void) { cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); softlockup_update_smpboot_threads(); @@ -522,7 +519,7 @@ static void softlockup_reconfigure_threads(bool enabled) softlockup_park_all_threads(); set_sample_period(); if (enabled) - softlockup_update_threads(); + softlockup_unpark_threads(); } /* @@ -563,7 +560,6 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static inline void softlockup_update_threads(void) { } static inline void softlockup_reconfigure_threads(bool enabled) { } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -598,11 +594,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL -/* - * Update the run state of the lockup detectors. - */ +/* Propagate any changes to the watchdog threads */ static void proc_watchdog_update(void) { + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); watchdog_nmi_reconfigure(); } @@ -721,15 +717,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, return err; } -static void proc_watchdog_cpumask_update(void) -{ - /* Remove impossible cpus to keep sysctl output clean. 
*/ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - - softlockup_update_threads(); - watchdog_nmi_reconfigure(); -} - /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the @@ -746,7 +733,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_cpumask_update(); + proc_watchdog_update(); mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); -- cgit v1.2.3 From 51d4052b01ca555e0d1d5fe297b309beb6c64aa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:14 +0200 Subject: watchdog/sysctl: Get rid of the #ifdeffery The sysctl of the nmi_watchdog file prevents writes by setting: min = max = 0 if none of the users is enabled. That involves ifdeffery and is competely non obvious. If none of the facilities is enabeld, then the file can simply be made read only. Move the ifdeffery into the header and use a constant for file permissions. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.706073616@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..539cb4e97bb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -891,14 +891,10 @@ static struct ctl_table kern_table[] = { .procname = "nmi_watchdog", .data = &nmi_watchdog_enabled, .maxlen = sizeof (int), - .mode = 0644, + .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) .extra2 = &one, -#else - .extra2 = &zero, -#endif }, { .procname = "watchdog_cpumask", -- cgit v1.2.3 From 7feeb9cd4f5b34476ffb9e6d58d58c5416375b19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:15 +0200 Subject: watchdog/sysctl: Clean up sysctl variable name space Reflect that these variables are user interface related and remove the whitespace damage in the sysctl table while at it. 
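For reference, the NMI_WATCHDOG_SYSCTL_PERM constant used by the nmi_watchdog entry in the #ifdeffery patch above has to come from a header; that include/ change is outside the kernel/ only diff quoted here, but a sketch of what such a definition could look like is:

#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
/* some hardlockup facility exists, keep the file writable */
# define NMI_WATCHDOG_SYSCTL_PERM	0644
#else
/* nothing to configure, expose the file read only */
# define NMI_WATCHDOG_SYSCTL_PERM	0444
#endif

Making the mode 0444 replaces the old min = max = 0 trick: writes are refused up front instead of being rejected by the min/max range check.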
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.783210221@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 16 ++++++++-------- kernel/watchdog.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 539cb4e97bb8..4c08ed4a379e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -871,9 +871,9 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, @@ -889,8 +889,8 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, - .maxlen = sizeof (int), + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, @@ -906,9 +906,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", - .data = &soft_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = &zero, .extra2 = &one, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ca8747221e87..baae9fc95031 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,8 +31,6 @@ static DEFINE_MUTEX(watchdog_mutex); -int __read_mostly nmi_watchdog_enabled; - #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED; @@ -40,6 +38,17 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif +int __read_mostly nmi_watchdog_user_enabled; +int __read_mostly soft_watchdog_user_enabled; +int __read_mostly watchdog_user_enabled; +int __read_mostly watchdog_thresh = 10; + +struct cpumask watchdog_allowed_mask __read_mostly; +static bool softlockup_threads_initialized __read_mostly; + +struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + #ifdef CONFIG_HARDLOCKUP_DETECTOR /* * Should we panic when a soft-lockup or hard-lockup occurs: @@ -85,12 +94,6 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); # endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -int __read_mostly watchdog_user_enabled; -int __read_mostly watchdog_thresh = 10; - -struct cpumask watchdog_cpumask __read_mostly; -unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); - /* * These functions can be overridden if an architecture implements its * own hardlockup detector. @@ -113,7 +116,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu) * watchdog_nmi_reconfigure can be implemented to be notified after any * watchdog configuration change. 
The arch hardlockup watchdog should * respond to the following variables: - * - nmi_watchdog_enabled + * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask * - sysctl_hardlockup_all_cpu_backtrace @@ -126,10 +129,6 @@ void __weak watchdog_nmi_reconfigure(void) { } /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -int __read_mostly soft_watchdog_enabled; - -struct cpumask watchdog_allowed_mask __read_mostly; -static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -606,14 +605,14 @@ static void proc_watchdog_update(void) /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * - * caller | table->data points to | 'which' contains the flag(s) - * -------------------|-----------------------|----------------------------- - * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed - * | | with SOFT_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + * caller | table->data points to | 'which' + * -------------------|----------------------------|-------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | + * | | SOFT_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From 6592ad2fcc8f15b4f99b36c1db7d9f65510c203b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:16 +0200 Subject: watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage Both the perf reconfiguration and the powerpc watchdog_nmi_reconfigure() need to be done in two steps. 1) Stop all NMIs 2) Read the new parameters and start NMIs Right now watchdog_nmi_reconfigure() is a combination of both. To allow a clean reconfiguration add a 'run' argument and split the functionality in powerpc. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170912194147.862865570@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index baae9fc95031..5693afd2b8ea 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -112,17 +112,25 @@ void __weak watchdog_nmi_disable(unsigned int cpu) hardlockup_detector_perf_disable(); } -/* - * watchdog_nmi_reconfigure can be implemented to be notified after any - * watchdog configuration change. 
The arch hardlockup watchdog should - * respond to the following variables: +/** + * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs + * @run: If false stop the watchdogs on all enabled CPUs + * If true start the watchdogs on all enabled CPUs + * + * The core call order is: + * watchdog_nmi_reconfigure(false); + * update_variables(); + * watchdog_nmi_reconfigure(true); + * + * The second call which starts the watchdogs again guarantees that the + * following variables are stable across the call. * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - sysctl_hardlockup_all_cpu_backtrace - * - hardlockup_panic + * + * After the call the variables can be changed again. */ -void __weak watchdog_nmi_reconfigure(void) { } +void __weak watchdog_nmi_reconfigure(bool run) { } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -515,10 +523,12 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(bool enabled) { + watchdog_nmi_reconfigure(false); softlockup_park_all_threads(); set_sample_period(); if (enabled) softlockup_unpark_threads(); + watchdog_nmi_reconfigure(true); } /* @@ -559,7 +569,11 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static inline void softlockup_reconfigure_threads(bool enabled) { } +static void softlockup_reconfigure_threads(bool enabled) +{ + watchdog_nmi_reconfigure(false); + watchdog_nmi_reconfigure(true); +} #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) @@ -599,7 +613,6 @@ static void proc_watchdog_update(void) /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); - watchdog_nmi_reconfigure(); } /* -- cgit v1.2.3 From 091549858ed881e5f3054374af4f5b1cac681d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:17 +0200 Subject: watchdog/core: Get rid of the racy update loop Letting user space poke directly at variables which are used at run time is stupid and causes a lot of race conditions and other issues. Seperate the user variables and on change invoke the reconfiguration, which then stops the watchdogs, reevaluates the new user value and restarts the watchdogs with the new parameters. 
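Distilled, the scheme is: user space only ever writes a *_user_enabled style staging variable, and the value the detectors actually consume is recomputed from it while everything is stopped. A compact sketch (all names and the two stub helpers are invented for illustration, not part of the patch below):

#define KNOB_ENABLED	1UL

static int knob_user_enabled = 1;	/* written from sysctl, under the mutex */
static unsigned long knob_active;	/* read by the running detectors */

static void stop_detectors(void)  { /* stands in for parking/stopping everything */ }
static void start_detectors(void) { /* stands in for unparking/restarting */ }

static void knob_reconfigure(void)
{
	stop_detectors();		/* nothing samples knob_active now */
	knob_active = knob_user_enabled ? KNOB_ENABLED : 0;
	start_detectors();		/* restart with a stable snapshot */
}

With the writes split this way, the cmpxchg() retry loop removed further down is no longer needed; a plain update under watchdog_mutex followed by proc_watchdog_update() is sufficient.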
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194147.939985640@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 95 +++++++++++++++++++++++++++---------------------------- 1 file changed, 47 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5693afd2b8ea..84886319d7b0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -32,15 +32,17 @@ static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | - NMI_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 1 #else -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 0 #endif -int __read_mostly nmi_watchdog_user_enabled; -int __read_mostly soft_watchdog_user_enabled; -int __read_mostly watchdog_user_enabled; +unsigned long __read_mostly watchdog_enabled; +int __read_mostly watchdog_user_enabled = 1; +int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; +int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; struct cpumask watchdog_allowed_mask __read_mostly; @@ -65,7 +67,7 @@ unsigned int __read_mostly hardlockup_panic = */ void __init hardlockup_detector_disable(void) { - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) @@ -75,9 +77,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -132,6 +134,23 @@ void __weak watchdog_nmi_disable(unsigned int cpu) */ void __weak watchdog_nmi_reconfigure(bool run) { } +/** + * lockup_detector_update_enable - Update the sysctl enable bit + * + * Caller needs to make sure that the NMI/perf watchdogs are off, so this + * can't race with watchdog_nmi_disable(). 
+ */ +static void lockup_detector_update_enable(void) +{ + watchdog_enabled = 0; + if (!watchdog_user_enabled) + return; + if (nmi_watchdog_user_enabled) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + if (soft_watchdog_user_enabled) + watchdog_enabled |= SOFT_WATCHDOG_ENABLED; +} + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Global variables, exported for sysctl */ @@ -160,14 +179,14 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { - watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; + soft_watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -521,12 +540,13 @@ static void softlockup_unpark_threads(void) softlockup_update_smpboot_threads(); } -static void softlockup_reconfigure_threads(bool enabled) +static void softlockup_reconfigure_threads(void) { watchdog_nmi_reconfigure(false); softlockup_park_all_threads(); set_sample_period(); - if (enabled) + lockup_detector_update_enable(); + if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); watchdog_nmi_reconfigure(true); } @@ -546,6 +566,8 @@ static __init void softlockup_init_threads(void) * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. */ + lockup_detector_update_enable(); + if (!IS_ENABLED(CONFIG_SYSCTL) && !(watchdog_enabled && watchdog_thresh)) return; @@ -559,7 +581,7 @@ static __init void softlockup_init_threads(void) mutex_lock(&watchdog_mutex); softlockup_threads_initialized = true; - softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + softlockup_reconfigure_threads(); mutex_unlock(&watchdog_mutex); } @@ -569,9 +591,10 @@ static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } -static void softlockup_reconfigure_threads(bool enabled) +static void softlockup_reconfigure_threads(void) { watchdog_nmi_reconfigure(false); + lockup_detector_update_enable(); watchdog_nmi_reconfigure(true); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -612,7 +635,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh); + softlockup_reconfigure_threads(); } /* @@ -630,48 +653,24 @@ static void proc_watchdog_update(void) static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; - int *watchdog_param = (int *)table->data; + int err, old, *param = table->data; cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); - /* - * If the parameter is being read return the state of the corresponding - * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the - * run state of the lockup detectors. - */ if (!write) { - *watchdog_param = (watchdog_enabled & which) != 0; + /* + * On read synchronize the userspace interface. This is a + * racy snapshot. 
+ */ + *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } else { + old = READ_ONCE(*param); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err) - goto out; - - /* - * There is a race window between fetching the current value - * from 'watchdog_enabled' and storing the new value. During - * this race window, watchdog_nmi_enable() can sneak in and - * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. - * The 'cmpxchg' detects this race and the loop retries. - */ - do { - old = watchdog_enabled; - /* - * If the parameter value is not zero set the - * corresponding bit(s), else clear it(them). - */ - if (*watchdog_param) - new = old | which; - else - new = old & ~which; - } while (cmpxchg(&watchdog_enabled, old, new) != old); - - if (old != new) + if (!err && old != READ_ONCE(*param)) proc_watchdog_update(); } -out: mutex_unlock(&watchdog_mutex); cpu_hotplug_enable(); return err; -- cgit v1.2.3 From 178b9f7a36d2c74a38274b66dd89f53611298a19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:18 +0200 Subject: watchdog/hardlockup/perf: Implement init time perf validation The watchdog tries to create perf events even after it figured out that perf is not functional or the requested event is not supported. That's braindead as this can be done once at init time and if not supported the NMI watchdog can be turned off unconditonally. Implement the perf hardlockup detector functionality for that. This creates a new event create function, which will replace the unholy mess of the existing one in later patches. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.019090547@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog_hld.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 0aa191ee3d51..f7e752e6e9b4 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -238,6 +238,27 @@ out: return 0; } +static int hardlockup_detector_event_create(void) +{ + unsigned int cpu = smp_processor_id(); + struct perf_event_attr *wd_attr; + struct perf_event *evt; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + pr_info("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); + return PTR_ERR(evt); + } + this_cpu_write(watchdog_ev, evt); + return 0; +} + /** * hardlockup_detector_perf_disable - Disable the local event */ @@ -315,3 +336,19 @@ void __init hardlockup_detector_perf_restart(void) perf_event_enable(event); } } + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ + int ret = hardlockup_detector_event_create(); + + if (ret) { + pr_info("Perf NMI watchdog permanetely disabled\n"); + } else { + perf_event_release_kernel(this_cpu_read(watchdog_ev)); + this_cpu_write(watchdog_ev, NULL); + } + return ret; +} -- cgit v1.2.3 From a994a3147e4c0c9c50a46e6cace7586254975e20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:19 +0200 Subject: 
watchdog/hardlockup/perf: Implement init time detection of perf Use the init time detection of the perf NMI watchdog to determine whether the perf NMI watchdog is functional. If not disable it permanentely. It won't come back magically at runtime. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.099799541@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84886319d7b0..fd8a998eb197 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,6 +44,7 @@ int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +int __read_mostly nmi_watchdog_available; struct cpumask watchdog_allowed_mask __read_mostly; static bool softlockup_threads_initialized __read_mostly; @@ -114,6 +115,12 @@ void __weak watchdog_nmi_disable(unsigned int cpu) hardlockup_detector_perf_disable(); } +/* Return 0, if a NMI watchdog is available. Error code otherwise */ +int __weak __init watchdog_nmi_probe(void) +{ + return hardlockup_detector_perf_init(); +} + /** * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs * @run: If false stop the watchdogs on all enabled CPUs @@ -145,7 +152,7 @@ static void lockup_detector_update_enable(void) watchdog_enabled = 0; if (!watchdog_user_enabled) return; - if (nmi_watchdog_user_enabled) + if (nmi_watchdog_available && nmi_watchdog_user_enabled) watchdog_enabled |= NMI_WATCHDOG_ENABLED; if (soft_watchdog_user_enabled) watchdog_enabled |= SOFT_WATCHDOG_ENABLED; @@ -692,6 +699,8 @@ int proc_watchdog(struct ctl_table *table, int write, int proc_nmi_watchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (!nmi_watchdog_available && write) + return -ENOTSUPP; return proc_watchdog_common(NMI_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); } @@ -764,5 +773,7 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif + if (!watchdog_nmi_probe()) + nmi_watchdog_available = true; softlockup_init_threads(); } -- cgit v1.2.3 From 2a1b8ee4f5665b4291e43e4a25d964c3eb2f4c32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:20 +0200 Subject: watchdog/hardlockup/perf: Implement CPU enable replacement watchdog_nmi_enable() is an unparseable mess, Provide a clean perf specific implementation, which will be used when the existing setup/teardown mess is replaced. 
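Since the watchdog_nmi_probe() hook added in the init time detection patch above is a __weak __init function, an architecture with its own NMI watchdog can answer the availability question itself instead of going through the perf path. A sketch of such an override (arch_has_nmi_watchdog() is an invented placeholder for whatever the platform actually checks):

int __init watchdog_nmi_probe(void)
{
	if (!arch_has_nmi_watchdog())
		return -ENODEV;
	return 0;
}

lockup_detector_init() runs the probe exactly once at boot; only a zero return sets nmi_watchdog_available, which is what lets proc_nmi_watchdog() above refuse writes with -ENOTSUPP on machines without a usable NMI watchdog.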
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.180215498@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog_hld.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index f7e752e6e9b4..99a3f22e48cc 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -259,6 +259,17 @@ static int hardlockup_detector_event_create(void) return 0; } +/** + * hardlockup_detector_perf_enable - Enable the local event + */ +void hardlockup_detector_perf_enable(void) +{ + if (hardlockup_detector_event_create()) + return; + + perf_event_enable(this_cpu_read(watchdog_ev)); +} + /** * hardlockup_detector_perf_disable - Disable the local event */ -- cgit v1.2.3 From 146c9d0e9dfdb62ed6afd43cc263efafbbfd1dcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:21 +0200 Subject: watchdog/hardlockup/perf: Use new perf CPU enable mechanism Get rid of the hodgepodge which tries to be smart about perf being unavailable and error printout rate limiting. That's all not required simply because this is never invoked when the perf NMI watchdog is not functional. Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.259651788@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 4 ++- kernel/watchdog_hld.c | 88 +++------------------------------------------------ 2 files changed, 8 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fd8a998eb197..5eb11960e4a2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -107,6 +107,7 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); */ int __weak watchdog_nmi_enable(unsigned int cpu) { + hardlockup_detector_perf_enable(); return 0; } @@ -465,7 +466,8 @@ static void watchdog_enable(unsigned int cpu) /* Initialize timestamp */ __touch_watchdog(); /* Enable the perf event */ - watchdog_nmi_enable(cpu); + if (watchdog_enabled & NMI_WATCHDOG_ENABLED) + watchdog_nmi_enable(cpu); watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 99a3f22e48cc..509bb6b59c41 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -25,7 +25,7 @@ static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; -static bool hardlockup_detector_disabled; +static unsigned int watchdog_cpus; void arch_touch_nmi_watchdog(void) { @@ -160,84 +160,6 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. 
- */ -static unsigned long firstcpu_err; -static atomic_t watchdog_cpus; - -int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - int firstcpu = 0; - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* A failure disabled the hardlockup detector permanently */ - if (hardlockup_detector_disabled) - return -ENODEV; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - if (atomic_inc_return(&watchdog_cpus) == 1) - firstcpu = 1; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save the first cpu's error for future comparision */ - if (firstcpu && IS_ERR(event)) - firstcpu_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for the first cpu initialized */ - if (firstcpu || firstcpu_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* skip displaying the same error again */ - if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Disabling hard lockup detector permanently\n"); - hardlockup_detector_disabled = true; - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - static int hardlockup_detector_event_create(void) { unsigned int cpu = smp_processor_id(); @@ -267,6 +189,9 @@ void hardlockup_detector_perf_enable(void) if (hardlockup_detector_event_create()) return; + if (!watchdog_cpus++) + pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); + perf_event_enable(this_cpu_read(watchdog_ev)); } @@ -282,10 +207,7 @@ void hardlockup_detector_perf_disable(void) this_cpu_write(watchdog_ev, NULL); this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); - - /* watchdog_nmi_enable() expects this to be zero initially. */ - if (atomic_dec_and_test(&watchdog_cpus)) - firstcpu_err = 0; + watchdog_cpus--; } } -- cgit v1.2.3 From a33d44843d4574ec05bec39527d8a87b7af2072c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:22 +0200 Subject: watchdog/hardlockup/perf: Simplify deferred event destroy Now that all functionality is properly serialized against CPU hotplug, remove the extra per cpu storage which holds the disabled events for cleanup. The core makes sure that cleanup happens before new events are created. 
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194148.340708074@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog_hld.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 509bb6b59c41..b2931154b5f2 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,7 +21,6 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; @@ -204,8 +203,6 @@ void hardlockup_detector_perf_disable(void) if (event) { perf_event_disable(event); - this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); watchdog_cpus--; } @@ -221,9 +218,9 @@ void hardlockup_detector_perf_cleanup(void) int cpu; for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); + struct perf_event *event = per_cpu(watchdog_ev, cpu); - per_cpu(dead_event, cpu) = NULL; + per_cpu(watchdog_ev, cpu) = NULL; perf_event_release_kernel(event); } cpumask_clear(&dead_events_mask); -- cgit v1.2.3 From ab5fe3ff38ff9653490910cc71dbbedc95a86e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Sep 2017 21:37:23 +0200 Subject: watchdog/hardlockup: Clean up hotplug locking mess All watchdog thread related functions are delegated to the smpboot thread infrastructure, which handles serialization against CPU hotplug correctly. The sysctl interface is completely decoupled from anything which requires CPU hotplug protection. No need to protect the sysctl writes against cpu hotplug anymore. Remove it and add the now required protection to the powerpc arch_nmi_watchdog implementation. 
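What the "now required protection" on the arch side amounts to is not part of the kernel/ only diff below; roughly, an arch implementation of the reconfigure hook now has to take the hotplug lock itself. A sketch with invented helper names (not the actual powerpc patch):

void watchdog_nmi_reconfigure(bool run)
{
	cpus_read_lock();	/* the sysctl path no longer disables hotplug */
	if (run)
		arch_watchdog_start_enabled_cpus();
	else
		arch_watchdog_stop_all_cpus();
	cpus_read_unlock();
}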
Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Ulrich Obergfell Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170912194148.418497420@linutronix.de Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5eb11960e4a2..f6ef163b72cd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -664,7 +664,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, { int err, old, *param = table->data; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); if (!write) { @@ -681,7 +680,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, proc_watchdog_update(); } mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } @@ -725,7 +723,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); old = READ_ONCE(watchdog_thresh); @@ -735,7 +732,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, proc_watchdog_update(); mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } @@ -750,7 +746,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - cpu_hotplug_disable(); mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); @@ -758,7 +753,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, proc_watchdog_update(); mutex_unlock(&watchdog_mutex); - cpu_hotplug_enable(); return err; } #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From 2554db916586b228ce93e6f74a12fd7fe430a004 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 25 Aug 2017 09:13:54 -0700 Subject: sched/wait: Break up long wake list walk We encountered workloads that have very long wake up list on large systems. A waker takes a long time to traverse the entire wake list and execute all the wake functions. We saw page wait list that are up to 3700+ entries long in tests of large 4 and 8 socket systems. It took 0.8 sec to traverse such list during wake up. Any other CPU that contends for the list spin lock will spin for a long time. It is a result of the numa balancing migration of hot pages that are shared by many threads. Multiple CPUs waking are queued up behind the lock, and the last one queued has to wait until all CPUs did all the wakeups. The page wait list is traversed with interrupt disabled, which caused various problems. This was the original cause that triggered the NMI watch dog timer in: https://patchwork.kernel.org/patch/9800303/ . Only extending the NMI watch dog timer there helped. This patch bookmarks the waker's scan position in wake list and break the wake up walk, to allow access to the list before the waker resume its walk down the rest of the wait list. It lowers the interrupt and rescheduling latency. This patch also provides a performance boost when combined with the next patch to break up page wakeup list walk. We saw 22% improvement in the will-it-scale file pread2 test on a Xeon Phi system running 256 threads. [ v2: Merged in Linus' changes to remove the bookmark_wake_function, and simply access to flags. 
] Reported-by: Kan Liang Tested-by: Kan Liang Signed-off-by: Tim Chen Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 78 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index d6afed6d0752..70701ef50465 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry } EXPORT_SYMBOL(remove_wait_queue); +/* + * Scan threshold to break wait queue walk. + * This allows a waker to take a break from holding the + * wait queue lock during the wait queue walk. + */ +#define WAITQUEUE_WALK_BREAK_CNT 64 /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just @@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) +static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key, + wait_queue_entry_t *bookmark) { wait_queue_entry_t *curr, *next; + int cnt = 0; + + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { + curr = list_next_entry(bookmark, entry); - list_for_each_entry_safe(curr, next, &wq_head->head, entry) { + list_del(&bookmark->entry); + bookmark->flags = 0; + } else + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); + + if (&curr->entry == &wq_head->head) + return nr_exclusive; + + list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - int ret = curr->func(curr, mode, wake_flags, key); + int ret; + + if (flags & WQ_FLAG_BOOKMARK) + continue; + + ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; + + if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && + (&next->entry != &wq_head->head)) { + bookmark->flags = WQ_FLAG_BOOKMARK; + list_add_tail(&bookmark->entry, &next->entry); + break; + } + } + return nr_exclusive; +} + +static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + unsigned long flags; + wait_queue_entry_t bookmark; + + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); + + while (bookmark.flags & WQ_FLAG_BOOKMARK) { + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, + wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); } } @@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; - - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); } EXPORT_SYMBOL(__wake_up); @@ -104,13 +155,13 @@ EXPORT_SYMBOL(__wake_up); */ void 
__wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -134,7 +185,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ if (unlikely(!wq_head)) @@ -143,9 +193,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); -- cgit v1.2.3 From 11a19c7b099f96d00a8dec52bfbb8475e89b6745 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 25 Aug 2017 09:13:55 -0700 Subject: sched/wait: Introduce wakeup boomark in wake_up_page_bit Now that we have added breaks in the wait queue scan and allow bookmark on scan position, we put this logic in the wake_up_page_bit function. We can have very long page wait list in large system where multiple pages share the same wait list. We break the wake up walk here to allow other cpus a chance to access the list, and not to disable the interrupts when traversing the list for too long. This reduces the interrupt and rescheduling latency, and excessive page wait queue lock hold time. [ v2: Remove bookmark_wake_function ] Signed-off-by: Tim Chen Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 70701ef50465..98feab7933c7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -165,6 +165,13 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo } EXPORT_SYMBOL_GPL(__wake_up_locked_key); +void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, + unsigned int mode, void *key, wait_queue_entry_t *bookmark) +{ + __wake_up_common(wq_head, mode, 1, 0, key, bookmark); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); + /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue -- cgit v1.2.3 From 439e7271dc2b63de379e37971dc2f64d71e24f8a Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 31 Aug 2017 16:37:41 -0400 Subject: livepatch: introduce shadow variable API Add exported API for livepatch modules: klp_shadow_get() klp_shadow_alloc() klp_shadow_get_or_alloc() klp_shadow_free() klp_shadow_free_all() that implement "shadow" variables, which allow callers to associate new shadow fields to existing data structures. This is intended to be used by livepatch modules seeking to emulate additions to data structure definitions. See Documentation/livepatch/shadow-vars.txt for a summary of the new shadow variable API, including a few common use cases. See samples/livepatch/livepatch-shadow-* for example modules that demonstrate shadow variables. 
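A compact sketch of how a patch module might use the new calls (the object hook names and the SV_NR_CALLS id are invented; the sample modules referenced above are the authoritative examples):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/livepatch.h>	/* assumed home of the klp_shadow_*() declarations */

#define SV_NR_CALLS	1	/* arbitrary shadow variable <id> picked by this patch */

/* Patched allocation path: attach a new counter "field" to @obj. */
static int patched_obj_init(void *obj)
{
	unsigned long nr_calls = 0;

	if (!klp_shadow_alloc(obj, SV_NR_CALLS, &nr_calls, sizeof(nr_calls),
			      GFP_KERNEL))
		return -ENOMEM;
	return 0;
}

/* Patched fast path: use the shadow field if this object has one. */
static void patched_obj_op(void *obj)
{
	unsigned long *nr_calls = klp_shadow_get(obj, SV_NR_CALLS);

	if (nr_calls)
		(*nr_calls)++;	/* any locking of the data is the caller's job */
}

/* Patched teardown: detach and free the shadow field again. */
static void patched_obj_free(void *obj)
{
	klp_shadow_free(obj, SV_NR_CALLS);
}

Objects created before the patch was applied simply have no <obj, SV_NR_CALLS> pair, so klp_shadow_get() returns NULL for them and the patched code falls back to the old behaviour.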
[jkosina@suse.cz: fix __klp_shadow_get_or_alloc() comment as spotted by Josh] Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/Makefile | 2 +- kernel/livepatch/shadow.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 kernel/livepatch/shadow.c (limited to 'kernel') diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 2b8bdb1925da..b36ceda6488e 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o transition.o +livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c new file mode 100644 index 000000000000..67e4360521f3 --- /dev/null +++ b/kernel/livepatch/shadow.c @@ -0,0 +1,277 @@ +/* + * shadow.c - Shadow Variables + * + * Copyright (C) 2014 Josh Poimboeuf + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +/** + * DOC: Shadow variable API concurrency notes: + * + * The shadow variable API provides a simple relationship between an + * pair and a pointer value. It is the responsibility of the + * caller to provide any mutual exclusion required of the shadow data. + * + * Once a shadow variable is attached to its parent object via the + * klp_shadow_*alloc() API calls, it is considered live: any subsequent + * call to klp_shadow_get() may then return the shadow variable's data + * pointer. Callers of klp_shadow_*alloc() should prepare shadow data + * accordingly. + * + * The klp_shadow_*alloc() API calls may allocate memory for new shadow + * variable structures. Their implementation does not call kmalloc + * inside any spinlocks, but API callers should pass GFP flags according + * to their specific needs. + * + * The klp_shadow_hash is an RCU-enabled hashtable and is safe against + * concurrent klp_shadow_free() and klp_shadow_get() operations. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static DEFINE_HASHTABLE(klp_shadow_hash, 12); + +/* + * klp_shadow_lock provides exclusive access to the klp_shadow_hash and + * the shadow variables it references. 
+ */ +static DEFINE_SPINLOCK(klp_shadow_lock); + +/** + * struct klp_shadow - shadow variable structure + * @node: klp_shadow_hash hash table node + * @rcu_head: RCU is used to safely free this structure + * @obj: pointer to parent object + * @id: data identifier + * @data: data area + */ +struct klp_shadow { + struct hlist_node node; + struct rcu_head rcu_head; + void *obj; + unsigned long id; + char data[]; +}; + +/** + * klp_shadow_match() - verify a shadow variable matches given + * @shadow: shadow variable to match + * @obj: pointer to parent object + * @id: data identifier + * + * Return: true if the shadow variable matches. + */ +static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj, + unsigned long id) +{ + return shadow->obj == obj && shadow->id == id; +} + +/** + * klp_shadow_get() - retrieve a shadow variable data pointer + * @obj: pointer to parent object + * @id: data identifier + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + + rcu_read_lock(); + + hash_for_each_possible_rcu(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + rcu_read_unlock(); + return shadow->data; + } + } + + rcu_read_unlock(); + + return NULL; +} +EXPORT_SYMBOL_GPL(klp_shadow_get); + +void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags, bool warn_on_exist) +{ + struct klp_shadow *new_shadow; + void *shadow_data; + unsigned long flags; + + /* Check if the shadow variable already exists */ + shadow_data = klp_shadow_get(obj, id); + if (shadow_data) + goto exists; + + /* Allocate a new shadow variable for use inside the lock below */ + new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); + if (!new_shadow) + return NULL; + + new_shadow->obj = obj; + new_shadow->id = id; + + /* Initialize the shadow variable if data provided */ + if (data) + memcpy(new_shadow->data, data, size); + + /* Look for again under the lock */ + spin_lock_irqsave(&klp_shadow_lock, flags); + shadow_data = klp_shadow_get(obj, id); + if (unlikely(shadow_data)) { + /* + * Shadow variable was found, throw away speculative + * allocation. + */ + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + goto exists; + } + + /* No found, so attach the newly allocated one */ + hash_add_rcu(klp_shadow_hash, &new_shadow->node, + (unsigned long)new_shadow->obj); + spin_unlock_irqrestore(&klp_shadow_lock, flags); + + return new_shadow->data; + +exists: + if (warn_on_exist) { + WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id); + return NULL; + } + + return shadow_data; +} + +/** + * klp_shadow_alloc() - allocate and add a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Allocates @size bytes for new shadow variable data using @gfp_flags + * and copies @size bytes from @data into the new shadow variable's own + * data space. If @data is NULL, @size bytes are still allocated, but + * no copy is performed. The new shadow variable is then added to the + * global hashtable. + * + * If an existing shadow variable can be found, this routine + * will issue a WARN, exit early and return NULL. + * + * Return: the shadow variable data element, NULL on duplicate or + * failure. 
+ */ +void *klp_shadow_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); +} +EXPORT_SYMBOL_GPL(klp_shadow_alloc); + +/** + * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Returns a pointer to existing shadow data if an shadow + * variable is already present. Otherwise, it creates a new shadow + * variable like klp_shadow_alloc(). + * + * This function guarantees that only one shadow variable exists with + * the given @id for the given @obj. It also guarantees that the shadow + * variable will be initialized by the given @data only when it did not + * exist before. + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); +} +EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); + +/** + * klp_shadow_free() - detach and free a shadow variable + * @obj: pointer to parent object + * @id: data identifier + * + * This function releases the memory for this shadow variable + * instance, callers should stop referencing it accordingly. + */ +void klp_shadow_free(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete from hash */ + hash_for_each_possible(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + break; + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free); + +/** + * klp_shadow_free_all() - detach and free all <*, id> shadow variables + * @id: data identifier + * + * This function releases the memory for all <*, id> shadow variable + * instances, callers should stop referencing them accordingly. + */ +void klp_shadow_free_all(unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + int i; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete all <*, id> from hash */ + hash_for_each(klp_shadow_hash, i, shadow, node) { + if (klp_shadow_match(shadow, shadow->obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free_all); -- cgit v1.2.3 From 5d9da759f7587c87252ef98e70bc0b4a89e4d036 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 14 Sep 2017 14:15:36 -0700 Subject: livepatch: __klp_shadow_get_or_alloc() is local to shadow.c ... therefore make it static. 
Fixes: 439e7271dc2 ("livepatch: introduce shadow variable API") Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/shadow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 67e4360521f3..fdac27588d60 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,7 +113,7 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, size_t size, gfp_t gfp_flags, bool warn_on_exist) { struct klp_shadow *new_shadow; -- cgit v1.2.3 From e67b8a685c7c984e834e3181ef4619cd7025a136 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 15 Sep 2017 14:37:38 +0100 Subject: bpf/verifier: reject BPF_ALU64|BPF_END Neither ___bpf_prog_run nor the JITs accept it. Also adds a new test case. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 477b6932c3c1..799b2451ef2d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2292,7 +2292,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + BPF_CLASS(insn->code) == BPF_ALU64) { verbose("BPF_END uses reserved fields\n"); return -EINVAL; } -- cgit v1.2.3 From 9cb067ef8a10bb13112e4d1c0ea996ec96527422 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:03 +0200 Subject: genirq: Fix cpumask check in __irq_startup_managed() The result of cpumask_any_and() is invalid when result greater or equal nr_cpu_ids. The current check is checking for greater only. Fix it. Fixes: 761ea388e8c4 ("genirq: Handle managed irqs gracefully in irq_startup()") Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Tony Luck Cc: Chen Yu Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: stable@vger.kernel.org Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: http://lkml.kernel.org/r/20170913213152.272283444@linutronix.de --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f51b7b6d2451..6fc89fd93824 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) irqd_clr_managed_shutdown(d); - if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { + if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. 
Chained interrupt -- cgit v1.2.3 From 582db7e0c4c2fc5bb4f932f268035883385e3692 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 18 Sep 2017 15:03:46 +0200 Subject: bpf: devmap: pass on return value of bpf_map_precharge_memlock If bpf_map_precharge_memlock in dev_map_alloc, -ENOMEM is returned regardless of the actual error produced by bpf_map_precharge_memlock. Fix it by passing on the error returned by bpf_map_precharge_memlock. Also return -EINVAL instead of -ENOMEM if the page count overflow check fails. This makes dev_map_alloc match the behavior of other bpf maps' alloc functions wrt. return values. Signed-off-by: Tobias Klauser Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 959c9a07f318..e093d9a2c4dd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err = -EINVAL; u64 cost; - int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + err = -ENOMEM; + /* A per cpu bitfield with a bit per possible net device */ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), __alignof__(unsigned long)); @@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void dev_map_free(struct bpf_map *map) -- cgit v1.2.3 From 8dd33bcb7050dd6f8c1432732f930932c9d3a33e Mon Sep 17 00:00:00 2001 From: Bo Yan Date: Mon, 18 Sep 2017 10:03:35 -0700 Subject: tracing: Erase irqsoff trace with empty write One convenient way to erase trace is "echo > trace". However, this is currently broken if the current tracer is irqsoff tracer. This is because irqsoff tracer use max_buffer as the default trace buffer. Set the max_buffer as the one to be cleared when it's the trace buffer currently in use. 
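At the syscall level, "echo > trace" is just an open of the trace file for writing with O_TRUNC, which is what the f_flags check in the hunk below keys on; the newline echo writes afterwards plays no part in the reset. A minimal userspace sketch (the tracefs mount point may differ):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* same effect as "echo > trace": O_TRUNC, nothing needs to be written */
	int fd = open("/sys/kernel/debug/tracing/trace", O_WRONLY | O_TRUNC);

	if (fd < 0)
		return 1;
	close(fd);
	return 0;
}

With the fix, when the current tracer sets print_max (as irqsoff does) this clears max_buffer, the buffer the trace file actually reports from, rather than the unused trace_buffer.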
Link: http://lkml.kernel.org/r/1505754215-29411-1-git-send-email-byan@nvidia.com Cc: Cc: stable@vger.kernel.org Fixes: 4acd4d00f ("tracing: give easy way to clear trace buffer") Signed-off-by: Bo Yan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5360b7aec57a..a7fb136da891 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { -- cgit v1.2.3 From c7b3ae0bd2ca658c7a71c49901d08c590294fac9 Mon Sep 17 00:00:00 2001 From: "Ziqian SUN (Zamir)" Date: Mon, 11 Sep 2017 14:26:35 +0800 Subject: tracing: Ignore mmiotrace from kernel commandline The mmiotrace tracer cannot be enabled with ftrace=mmiotrace in kernel commandline. With this patch, noboot is added to the tracer struct, and when system boot with a tracer that has noboot=true, it will print out a warning message and continue booting. Link: http://lkml.kernel.org/r/1505111195-31942-1-git-send-email-zsun@redhat.com Signed-off-by: Ziqian SUN (Zamir) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 7 +++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_mmiotrace.c | 1 + 3 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a7fb136da891..d3ca35f38803 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5364,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers won't work on kernel command line */ + if (system_state < SYSTEM_RUNNING && t->noboot) { + pr_warn("Tracer '%s' is not allowed on command line, ignored\n", + t->name); + goto out; + } + /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) { ret = -EINVAL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fb5d54d0d1b3..652c682707cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,8 @@ struct tracer { #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif + /* True if tracer cannot be enabled in kernel param */ + bool noboot; }; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index cd7480d0a201..dca78fc48439 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly = .close = mmio_close, .read = mmio_read, .print_line = mmio_print_line, + .noboot = true, }; __init static int init_mmio_trace(void) -- cgit v1.2.3 From e454cf5958538666635488030046b6a84a22d447 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 18 Sep 2017 15:30:55 -0400 Subject: bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE This is a simple non-recursive delete operation. 
It prunes paths of empty nodes in the tree, but it does not try to further compress the tree as nodes are removed. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b767844a76f..9d58a576b2ae 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -389,10 +389,84 @@ out: return ret; } -static int trie_delete_elem(struct bpf_map *map, void *key) +/* Called from syscall or from eBPF program */ +static int trie_delete_elem(struct bpf_map *map, void *_key) { - /* TODO */ - return -ENOSYS; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key; + struct lpm_trie_node __rcu **trim; + struct lpm_trie_node *node; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Walk the tree looking for an exact key/length match and keeping + * track of where we could begin trimming the tree. The trim-point + * is the sub-tree along the walk consisting of only single-child + * intermediate nodes and ending at a leaf node that we want to + * remove. + */ + trim = &trie->root; + node = rcu_dereference_protected( + trie->root, lockdep_is_held(&trie->lock)); + while (node) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + /* If we hit a node that has more than one child or is a valid + * prefix itself, do not remove it. Reset the root of the trim + * path to its descendant on our path. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || + (node->child[0] && node->child[1])) + trim = &node->child[next_bit]; + node = rcu_dereference_protected( + node->child[next_bit], lockdep_is_held(&trie->lock)); + } + + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + ret = -ENOENT; + goto out; + } + + trie->n_entries--; + + /* If the node we are removing is not a leaf node, simply mark it + * as intermediate and we are done. + */ + if (rcu_access_pointer(node->child[0]) || + rcu_access_pointer(node->child[1])) { + node->flags |= LPM_TREE_NODE_FLAG_IM; + goto out; + } + + /* trim should now point to the slot holding the start of a path from + * zero or more intermediate nodes to our leaf node for deletion. + */ + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { + RCU_INIT_POINTER(*trim, NULL); + trim = rcu_access_pointer(node->child[0]) ? + &node->child[0] : + &node->child[1]; + kfree_rcu(node, rcu); + } + +out: + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; } #define LPM_DATA_SIZE_MAX 256 -- cgit v1.2.3 From f454322efbf6faee695f517c6b52c4dc03cacd3e Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:11 +0300 Subject: signal: replace sigset_to_compat() with put_compat_sigset() There are 4 callers of sigset_to_compat() in the entire kernel. One is in sparc compat rt_sigaction(2), the rest are in kernel/signal.c itself. All are followed by copy_to_user(), and all but the sparc one are under "if it's big-endian..." ifdefs. 
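The duplicated call-site pattern being folded away looks roughly like this (condensed from the hunks below):

    #ifdef __BIG_ENDIAN
        compat_sigset_t set32;

        sigset_to_compat(&set32, &set);
        if (copy_to_user(uset, &set32, sizeof(compat_sigset_t)))
            return -EFAULT;
    #else
        /* little-endian layouts match, so just punt to the native syscall */
        return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
    #endif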
Let's transform sigset_to_compat() into put_compat_sigset() that also calls copy_to_user(). Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/compat.c | 20 ++++++++++++++------ kernel/signal.c | 27 ++++++--------------------- 2 files changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 772e038d04d9..18dd902c9052 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -497,15 +497,23 @@ sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) } EXPORT_SYMBOL_GPL(sigset_from_compat); -void -sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) { + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; switch (_NSIG_WORDS) { - case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; - case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; - case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; - case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? -EFAULT : 0; +#endif } #ifdef CONFIG_NUMA diff --git a/kernel/signal.c b/kernel/signal.c index 800a18f77732..14ad6bb90dad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2621,13 +2621,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, if (error) return error; } - if (oset) { - compat_sigset_t old32; - sigset_to_compat(&old32, &old_set); - if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return 0; + return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; #else return sys_rt_sigprocmask(how, (sigset_t __user *)nset, (sigset_t __user *)oset, sigsetsize); @@ -2669,20 +2663,11 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t set; int err = do_sigpending(&set, sigsetsize); - if (!err) { - compat_sigset_t set32; - sigset_to_compat(&set32, &set); - /* we can get here only if sigsetsize <= sizeof(set) */ - if (copy_to_user(uset, &set32, sigsetsize)) - err = -EFAULT; - } + if (!err) + err = put_compat_sigset(uset, &set, sigsetsize); return err; -#else - return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); -#endif } #endif @@ -3451,7 +3436,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; - compat_sigset_t mask; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif @@ -3463,6 +3447,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; + compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER @@ -3478,10 +3463,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); if (!ret && oact) { - sigset_to_compat(&mask, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, + sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), -- cgit v1.2.3 From 1681634b8c70353d8ca211b9b3443889a16dafeb Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:29 +0300 Subject: signal: simplify compat_sigpending() Remove "if it's big-endian..." ifdef in compat_sigpending(), use the endian-agnostic variant. Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/signal.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 14ad6bb90dad..f59c05fc374a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3330,15 +3330,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { -#ifdef __BIG_ENDIAN sigset_t set; int err = do_sigpending(&set, sizeof(set.sig[0])); if (!err) err = put_user(set.sig[0], set32); return err; -#else - return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); -#endif } #endif -- cgit v1.2.3 From 176826af03666758c656dd27f098cc889b71638b Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:43 +0300 Subject: signal: lift sigset size check out of do_sigpending() As sigsetsize argument of do_sigpending() is not used anywhere else in that function after the check, remove this argument and move the check out of do_sigpending() into rt_sigpending() and its compat analog. Suggested-by: Al Viro Signed-off-by: Dmitry V. 
Levin Signed-off-by: Al Viro --- kernel/signal.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f59c05fc374a..9fbc574ced10 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2629,11 +2629,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, } #endif -static int do_sigpending(void *set, unsigned long sigsetsize) +static int do_sigpending(sigset_t *set) { - if (sigsetsize > sizeof(sigset_t)) - return -EINVAL; - spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, ¤t->signal->shared_pending.signal); @@ -2653,7 +2650,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err = do_sigpending(&set, sigsetsize); + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); if (!err && copy_to_user(uset, &set, sigsetsize)) err = -EFAULT; return err; @@ -2664,7 +2666,12 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; - int err = do_sigpending(&set, sigsetsize); + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); if (!err) err = put_compat_sigset(uset, &set, sigsetsize); return err; @@ -3331,7 +3338,7 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; - int err = do_sigpending(&set, sizeof(set.sig[0])); + int err = do_sigpending(&set); if (!err) err = put_user(set.sig[0], set32); return err; -- cgit v1.2.3 From b8e8e1aa9f14110da180569908bbe538c9e9dc63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 20:42:54 -0400 Subject: get rid of {get,put}_compat_itimerspec() no users left Signed-off-by: Al Viro --- kernel/compat.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 18dd902c9052..d43b18031116 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } -int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src) -{ - if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || - __compat_get_timespec(&dst->it_value, &src->it_value)) - return -EFAULT; - return 0; -} - -int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src) -{ - if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || - __compat_put_timespec(&src->it_value, &dst->it_value)) - return -EFAULT; - return 0; -} - int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { -- cgit v1.2.3 From 3968cf623892d710e651070243fd16af312a9797 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 21:45:17 -0400 Subject: get_compat_sigset() similar to put_compat_sigset() Signed-off-by: Al Viro --- kernel/compat.c | 23 ++++++++++++++++------- kernel/signal.c | 27 ++++----------------------- 2 files changed, 20 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d43b18031116..a46a4a40bb8b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -467,17 +467,26 @@ Efault: return -EFAULT; } -void -sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) +int 
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) { +#ifdef __BIG_ENDIAN + compat_sigset_t v; + if (copy_from_user(&v, compat, sizeof(compat_sigset_t))) + return -EFAULT; switch (_NSIG_WORDS) { - case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); - case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); - case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); - case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); + case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } +#else + if (copy_from_user(set, compat, sizeof(compat_sigset_t))) + return -EFAULT; +#endif + return 0; } -EXPORT_SYMBOL_GPL(sigset_from_compat); +EXPORT_SYMBOL_GPL(get_compat_sigset); int put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, diff --git a/kernel/signal.c b/kernel/signal.c index 9fbc574ced10..36a523640894 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2600,7 +2600,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. */ @@ -2608,13 +2607,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return -EINVAL; if (nset) { - compat_sigset_t new32; sigset_t new_set; int error; - if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&new_set, nset)) return -EFAULT; - - sigset_from_compat(&new_set, &new32); sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); @@ -2622,10 +2618,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return error; } return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; -#else - return sys_rt_sigprocmask(how, (sigset_t __user *)nset, - (sigset_t __user *)oset, sigsetsize); -#endif } #endif @@ -2908,7 +2900,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { - compat_sigset_t s32; sigset_t s; struct timespec t; siginfo_t info; @@ -2917,9 +2908,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + if (get_compat_sigset(&s, uthese)) return -EFAULT; - sigset_from_compat(&s, &s32); if (uts) { if (compat_get_timespec(&t, uts)) @@ -3450,18 +3440,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; - compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif - ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; - sigset_from_compat(&new_ka.sa.sa_mask, &mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); @@ -3649,22 +3637,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t newset; - compat_sigset_t newset32; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&newset, unewset)) return -EFAULT; - sigset_from_compat(&newset, &newset32); return sigsuspend(&newset); -#else - /* on little-endian bitmaps don't care about granularity */ - return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); -#endif } #endif -- cgit v1.2.3 From 75df6e688ccd517e339a7c422ef7ad73045b18a2 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sun, 17 Sep 2017 03:23:48 -0700 Subject: tracing: Fix trace_pipe behavior for instance traces When reading data from trace_pipe, tracing_wait_pipe() performs a check to see if tracing has been turned off after some data was read. Currently, this check always looks at global trace state, but it should be checking the trace instance where trace_pipe is located at. Because of this bug, cat instances/i1/trace_pipe in the following script will immediately exit instead of waiting for data: cd /sys/kernel/debug/tracing echo 0 > tracing_on mkdir -p instances/i1 echo 1 > instances/i1/tracing_on echo 1 > instances/i1/events/sched/sched_process_exec/enable cat instances/i1/trace_pipe Link: http://lkml.kernel.org/r/20170917102348.1615-1-tahsin@google.com Cc: stable@vger.kernel.org Fixes: 10246fa35d4f ("tracing: give easy way to clear trace buffer") Signed-off-by: Tahsin Erdogan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3ca35f38803..752e5daf0896 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5680,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); -- cgit v1.2.3 From 930651a75bf1ba6893a8b8475270664ebdb6cf4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 09:15:59 -0700 Subject: bpf: do not disable/enable BH in bpf_map_free_id() syzkaller reported following splat [1] Since hard irq are disabled by the caller, bpf_map_free_id() should not try to enable/disable BH. Another solution would be to change htab_map_delete_elem() to defer the free_htab_elem() call after raw_spin_unlock_irqrestore(&b->lock, flags), but this might be not enough to cover other code paths. [1] WARNING: CPU: 1 PID: 8052 at kernel/softirq.c:161 __local_bh_enable_ip +0x1e/0x160 kernel/softirq.c:161 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 1 PID: 8052 Comm: syz-executor1 Not tainted 4.13.0-next-20170915+ #23 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:__local_bh_enable_ip+0x1e/0x160 kernel/softirq.c:161 RSP: 0018:ffff8801cdcd7748 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000201 RCX: 0000000000000000 RDX: 1ffffffff0b5933c RSI: 0000000000000201 RDI: ffffffff85ac99e0 RBP: ffff8801cdcd7758 R08: ffffffff85b87158 R09: 1ffff10039b9aec6 R10: ffff8801c99f24c0 R11: 0000000000000002 R12: ffffffff817b0b47 R13: dffffc0000000000 R14: ffff8801cdcd77e8 R15: 0000000000000001 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:176 [inline] _raw_spin_unlock_bh+0x30/0x40 kernel/locking/spinlock.c:207 spin_unlock_bh include/linux/spinlock.h:361 [inline] bpf_map_free_id kernel/bpf/syscall.c:197 [inline] __bpf_map_put+0x267/0x320 kernel/bpf/syscall.c:227 bpf_map_put+0x1a/0x20 kernel/bpf/syscall.c:235 bpf_map_fd_put_ptr+0x15/0x20 kernel/bpf/map_in_map.c:96 free_htab_elem+0xc3/0x1b0 kernel/bpf/hashtab.c:658 htab_map_delete_elem+0x74d/0x970 kernel/bpf/hashtab.c:1063 map_delete_elem kernel/bpf/syscall.c:633 [inline] SYSC_bpf kernel/bpf/syscall.c:1479 [inline] SyS_bpf+0x2188/0x46a0 kernel/bpf/syscall.c:1451 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: f3f1c054c288 ("bpf: Introduce bpf_map ID") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Acked-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb17e1cd1d43..25d074920a00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map) static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { + unsigned long flags; + if (do_idr_lock) - spin_lock_bh(&map_idr_lock); + spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); if (do_idr_lock) - spin_unlock_bh(&map_idr_lock); + spin_unlock_irqrestore(&map_idr_lock, flags); else __release(&map_idr_lock); } -- cgit v1.2.3 From 7c30013133964aaa2f45c17d6e9782ac6cfd7f5f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Sep 2017 00:44:21 +0200 Subject: bpf: fix ri->map_owner pointer on bpf_prog_realloc Commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") passed the pointer to the prog itself to be loaded into r4 prior on bpf_redirect_map() helper call, so that we can store the owner into ri->map_owner out of the helper. Issue with that is that the actual address of the prog is still subject to change when subsequent rewrites occur that require slow path in bpf_prog_realloc() to alloc more memory, e.g. from patching inlining helper functions or constant blinding. Thus, we really need to take prog->aux as the address we're holding, which also works with prog clones as they share the same aux object. 
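A rough sketch of why the prog address can move while aux does not (illustrative only, simplified from the realloc slow path; flags and error handling omitted):

    /* image no longer fits, so a new area is allocated */
    fp = __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
    memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
    /* the aux pointer is copied as-is: old and new prog share one
     * struct bpf_prog_aux, so &prog changes but prog->aux stays stable */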
Instead of then fetching aux->prog during runtime, which could potentially incur cache misses due to false sharing, we are going to just use aux for comparison on the map owner. This will also keep the patchlet of the same size, and later check in xdp_map_invalid() only accesses read-only aux pointer from the prog, it's also in the same cacheline already from prior access when calling bpf_func. Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 799b2451ef2d..b914fbe1383e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } if (insn->imm == BPF_FUNC_redirect_map) { - u64 addr = (unsigned long)prog; + /* Note, we cannot use prog directly as imm as subsequent + * rewrites would still change the prog pointer. The only + * stable address we can use is aux, which also works with + * prog clones during blinding. + */ + u64 addr = (unsigned long)prog->aux; struct bpf_insn r4_ld[] = { BPF_LD_IMM64(BPF_REG_4, addr), *insn, -- cgit v1.2.3 From abca5fc535a3ee0f36fb6d4468a453eaae769921 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2017 18:17:46 -0400 Subject: sched_rr_get_interval(): move compat to native, get rid of set_fs() switch to using timespec64 internally, while we are at it Signed-off-by: Al Viro --- kernel/compat.c | 16 ---------------- kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index a46a4a40bb8b..d1cee656a7ed 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -562,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, } #endif -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct compat_timespec __user *, interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs(old_fs); - if (compat_put_timespec(&t, interval)) - return -EFAULT; - return ret; -} - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..e74f0a5a8647 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -5098,13 +5099,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * Return: On success, 0 and the timeslice is in @interval. Otherwise, * an error code. */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; unsigned int time_slice; struct rq_flags rf; - struct timespec t; struct rq *rq; int retval; @@ -5128,15 +5127,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, task_rq_unlock(rq, p, &rf); rcu_read_unlock(); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; - return retval; + jiffies_to_timespec64(time_slice, t); + return 0; out_unlock: rcu_read_unlock(); return retval; } +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = compat_put_timespec64(&t, interval); + return retval; +} +#endif + void sched_show_task(struct task_struct *p) { unsigned long free = 0; -- cgit v1.2.3 From ec9dd352d591f0c90402ec67a317c1ed4fb2e638 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Sep 2017 16:38:36 -0700 Subject: bpf: one perf event close won't free bpf program attached by another perf event This patch fixes a bug exhibited by the following scenario: 1. fd1 = perf_event_open with attr.config = ID1 2. attach bpf program prog1 to fd1 3. fd2 = perf_event_open with attr.config = ID1 4. user program closes fd2 and prog1 is detached from the tracepoint. 5. user program with fd1 does not work properly as tracepoint no output any more. The issue happens at step 4. Multiple perf_event_open can be called successfully, but only one bpf prog pointer in the tp_event. In the current logic, any fd release for the same tp_event will free the tp_event->prog. The fix is to free tp_event->prog only when the closing fd corresponds to the one which registered the program. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..6bc21e202ae4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } -- cgit v1.2.3 From c4fa6c43ce4b427350cfbb659436bfe3d9e09a1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 21 Sep 2017 09:54:13 -0400 Subject: cgroup: Reinit cgroup_taskset structure before cgroup_migrate_execute() returns The cgroup_taskset structure within the larger cgroup_mgctx structure is supposed to be used once and then discarded. That is not really the case in the hotplug code path: cpuset_hotplug_workfn() - cgroup_transfer_tasks() - cgroup_migrate() - cgroup_migrate_add_task() - cgroup_migrate_execute() In this case, the cgroup_migrate() function is called multiple time with the same cgroup_mgctx structure to transfer the tasks from one cgroup to another one-by-one. The second time cgroup_migrate() is called, the cgroup_taskset will be in an incorrect state and so may cause the system to panic. 
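A minimal sketch of the reuse, simplified from cgroup_transfer_tasks() (locals are illustrative):

    DEFINE_CGROUP_MGCTX(mgctx);

    /* move tasks out of @from one at a time */
    do {
        /* ... pick the next task in @from ... */
        if (task)
            /* the same mgctx, and thus the same embedded
             * cgroup_taskset, is handed to every iteration */
            ret = cgroup_migrate(task, false, &mgctx);
    } while (task && !ret);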
For example, [ 150.888410] Faulting instruction address: 0xc0000000001db648 [ 150.888414] Oops: Kernel access of bad area, sig: 11 [#1] [ 150.888417] SMP NR_CPUS=2048 [ 150.888417] NUMA [ 150.888419] pSeries : [ 150.888545] NIP [c0000000001db648] cpuset_can_attach+0x58/0x1b0 [ 150.888548] LR [c0000000001db638] cpuset_can_attach+0x48/0x1b0 [ 150.888551] Call Trace: [ 150.888554] [c0000005f65cb940] [c0000000001db638] cpuset_can_attach+0x48/0x1b 0 (unreliable) [ 150.888559] [c0000005f65cb9a0] [c0000000001cff04] cgroup_migrate_execute+0xc4/0x4b0 [ 150.888563] [c0000005f65cba20] [c0000000001d7d14] cgroup_transfer_tasks+0x1d4/0x370 [ 150.888568] [c0000005f65cbb70] [c0000000001ddcb0] cpuset_hotplug_workfn+0x710/0x8f0 [ 150.888572] [c0000005f65cbc80] [c00000000012032c] process_one_work+0x1ac/0x4d0 [ 150.888576] [c0000005f65cbd20] [c0000000001206f8] worker_thread+0xa8/0x5b0 [ 150.888580] [c0000005f65cbdc0] [c0000000001293f8] kthread+0x168/0x1b0 [ 150.888584] [c0000005f65cbe30] [c00000000000b368] ret_from_kernel_thread+0x5c/0x74 To allow reuse of the cgroup_mgctx structure, some fields in that structure are now re-initialized at the end of cgroup_migrate_execute() function call so that the structure can be reused again in a later iteration without causing problem. This bug was introduced in the commit e595cd706982 ("group: track migration context in cgroup_mgctx") in 4.11. This commit moves the cgroup_taskset initialization out of cgroup_migrate(). The commit 10467270fb3 ("cgroup: don't call migration methods if there are no tasks to migrate") helped, but did not completely resolve the problem. Fixes: e595cd706982bff0211e6fafe5a108421e747fbc ("group: track migration context in cgroup_mgctx") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.11+ --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..44857278eb8a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2311,6 +2311,14 @@ out_release_tset: list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); + + /* + * Re-initialize the cgroup_taskset structure in case it is reused + * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() + * iteration. + */ + tset->nr_tasks = 0; + tset->csets = &tset->src_csets; return ret; } -- cgit v1.2.3 From 28585a832602747cbfa88ad8934013177a3aae38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Sep 2017 14:10:22 -0700 Subject: rcu: Allow for page faults in NMI handlers A number of architecture invoke rcu_irq_enter() on exception entry in order to allow RCU read-side critical sections in the exception handler when the exception is from an idle or nohz_full CPU. This works, at least unless the exception happens in an NMI handler. In that case, rcu_nmi_enter() would already have exited the extended quiescent state, which would mean that rcu_irq_enter() would (incorrectly) cause RCU to think that it is again in an extended quiescent state. This will in turn result in lockdep splats in response to later RCU read-side critical sections. This commit therefore causes rcu_irq_enter() and rcu_irq_exit() to take no action if there is an rcu_nmi_enter() in effect, thus avoiding the unscheduled return to RCU quiescent state. This in turn should make the kernel safe for on-demand RCU voyeurism. 
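The nesting being tolerated, sketched as a call sequence:

    /*
     * NMI arrives while the CPU is in an extended quiescent state:
     *
     *   rcu_nmi_enter()      <- RCU starts watching
     *     ... handler takes a page fault ...
     *     rcu_irq_enter()    <- must be a no-op now (dynticks_nmi_nesting != 0)
     *     rcu_irq_exit()     <- likewise a no-op
     *   rcu_nmi_exit()
     */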
Link: http://lkml.kernel.org/r/20170922211022.GA18084@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..63bee8e1b193 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -888,6 +888,11 @@ void rcu_irq_exit(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdtp->dynticks_nesting < 1); if (rdtp->dynticks_nesting <= 1) { @@ -1020,6 +1025,11 @@ void rcu_irq_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && -- cgit v1.2.3 From 9aadde91b3c035413c806619beb3e3ef6e697953 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:22:19 -0400 Subject: extable: Consolidate *kernel_text_address() functions The functionality between kernel_text_address() and _kernel_text_address() is the same except that _kernel_text_address() does a little more (that function needs a rename, but that can be done another time). Instead of having duplicate code in both, simply have _kernel_text_address() calls kernel_text_address() instead. This is marked for stable because there's an RCU bug that can happen if one of these functions gets called while RCU is not watching. That fix depends on this fix to keep from having to write the fix twice. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 38c2412401a1..a7024a494faf 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr) int __kernel_text_address(unsigned long addr) { - if (core_kernel_text(addr)) - return 1; - if (is_module_text_address(addr)) - return 1; - if (is_ftrace_trampoline(addr)) - return 1; - if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; - if (is_bpf_text_address(addr)) + if (kernel_text_address(addr)) return 1; /* * There might be init symbols in saved stacktraces. -- cgit v1.2.3 From e8cac8b1d10589be45671a5ade0926a639b543b7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:36:32 -0400 Subject: extable: Enable RCU if it is not watching in kernel_text_address() If kernel_text_address() is called when RCU is not watching, it can cause an RCU bug because is_module_text_address(), the is_kprobe_*insn_slot() and is_bpf_text_address() functions require the use of RCU. Only enable RCU if it is not currently watching before it calls is_module_text_address(). 
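The shape of the guard (a sketch of the full hunk below):

    bool no_rcu = !rcu_is_watching();

    if (no_rcu)
        rcu_nmi_enter();    /* may be called from anywhere, treat like an NMI */
    /* ... is_module_text_address(), kprobe slot and BPF checks ... */
    if (no_rcu)
        rcu_nmi_exit();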
The use of rcu_nmi_enter() is used to enable RCU because kernel_text_address() can happen pretty much anywhere (like an NMI), and even from within an NMI. It is called via save_stack_trace() that can be called by any WARN() or tracing function, which can happen while RCU is not watching (for example, going to or coming from idle, or during CPU take down or bring up). Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index a7024a494faf..9aa1cc41ecf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -119,17 +119,42 @@ int __kernel_text_address(unsigned long addr) int kernel_text_address(unsigned long addr) { + bool no_rcu; + int ret = 1; + if (core_kernel_text(addr)) return 1; + + /* + * If a stack dump happens while RCU is not watching, then + * RCU needs to be notified that it requires to start + * watching again. This can happen either by tracing that + * triggers a stack trace, or a WARN() that happens during + * coming back from idle, or cpu on or offlining. + * + * is_module_text_address() as well as the kprobe slots + * and is_bpf_text_address() require RCU to be watching. + */ + no_rcu = !rcu_is_watching(); + + /* Treat this like an NMI as it can happen anywhere */ + if (no_rcu) + rcu_nmi_enter(); + if (is_module_text_address(addr)) - return 1; + goto out; if (is_ftrace_trampoline(addr)) - return 1; + goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; + goto out; if (is_bpf_text_address(addr)) - return 1; - return 0; + goto out; + ret = 0; +out: + if (no_rcu) + rcu_nmi_exit(); + + return ret; } /* -- cgit v1.2.3 From 15516c89acce948debc4c598e03c3fee53045797 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Sep 2017 13:00:21 -0400 Subject: tracing: Remove RCU work arounds from stack tracer Currently the stack tracer calls rcu_irq_enter() to make sure RCU is watching when it records a stack trace. But if the stack tracer is triggered while tracing inside of a rcu_irq_enter(), calling rcu_irq_enter() unconditionally can be problematic. The reason for having rcu_irq_enter() in the first place has been fixed from within the saving of the stack trace code, and there's no reason for doing it in the stack tracer itself. Just remove it. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Suggested-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; - /* - * There's a slight chance that we are tracing inside the - * RCU infrastructure, and rcu_irq_enter() will not work - * as expected. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); - /* - * RCU may not be watching, make it see us. - * The stack trace code uses rcu_sched. 
- */ - rcu_irq_enter(); - /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - rcu_irq_exit(); arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } -- cgit v1.2.3 From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 17:48:06 +0200 Subject: futex: Fix pi_state->owner serialization There was a reported suspicion about a race between exit_pi_state_list() and put_pi_state(). The same report mentioned the comment with put_pi_state() said it should be called with hb->lock held, and it no longer is in all places. As it turns out, the pi_state->owner serialization is indeed broken. As per the new rules: 734009e96d19 ("futex: Change locking rules") pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock. For the sites setting pi_state->owner we already hold wait_lock (where required) but exit_pi_state_list() and put_pi_state() were not and raced on clearing it. Fixes: 734009e96d19 ("futex: Change locking rules") Reported-by: Gratian Crisan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: dvhart@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net --- kernel/futex.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3d38eaf05492..0518a0bfc746 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state) /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. - * - * Must be called with the hb lock held. */ static void put_pi_state(struct futex_pi_state *pi_state) { @@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + struct task_struct *owner; - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + owner = pi_state->owner; + if (owner) { + raw_spin_lock(&owner->pi_lock); + list_del_init(&pi_state->list); + raw_spin_unlock(&owner->pi_lock); + } + rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) + if (current->pi_state_cache) { kfree(pi_state); - else { + } else { /* * pi_state->list is already empty. * clear pi_state->owner. 
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); continue; } @@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock(&curr->pi_lock); get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); @@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ pi_state->owner = p; raw_spin_unlock_irq(&p->pi_lock); @@ -2878,6 +2888,7 @@ retry: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); put_pi_state(pi_state); -- cgit v1.2.3 From 2827a418ca1b23e432e62c9b3d0e7cf3255dfe88 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 19 Sep 2017 22:04:12 +0200 Subject: genirq: Check __free_irq() return value for NULL __free_irq() can return a NULL irqaction for example when trying to free already-free IRQ, but the callsite unconditionally dereferences the returned pointer. Fix this by adding a check and return NULL. Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170919200412.GA29985@gmail.com --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..d00132b5c325 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id) #endif action = __free_irq(irq, dev_id); + + if (!action) + return NULL; + devname = action->name; kfree(action); return devname; -- cgit v1.2.3 From 5acb3cc2c2e9d3020a4fee43763c6463767f1572 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 20 Sep 2017 13:12:20 -0600 Subject: blktrace: Fix potential deadlock between delete & sysfs ops The lockdep code had reported the following unsafe locking scenario: CPU0 CPU1 ---- ---- lock(s_active#228); lock(&bdev->bd_mutex/1); lock(s_active#228); lock(&bdev->bd_mutex); *** DEADLOCK *** The deadlock may happen when one task (CPU1) is trying to delete a partition in a block device and another task (CPU0) is accessing tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that partition. The s_active isn't an actual lock. It is a reference count (kn->count) on the sysfs (kernfs) file. Removal of a sysfs file, however, require a wait until all the references are gone. The reference count is treated like a rwsem using lockdep instrumentation code. The fact that a thread is in the sysfs callback method or in the ioctl call means there is a reference to the opended sysfs or device file. That should prevent the underlying block structure from being removed. 
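In code terms, the old pattern that closed the deadlock loop was roughly (sketch of the lines removed below):

    /* sysfs ->show()/->store() and blk_trace_ioctl() both did */
    mutex_lock(&bdev->bd_mutex);    /* while kernfs pins s_active on the attribute */
    /* ... read or tear down q->blk_trace ... */
    mutex_unlock(&bdev->bd_mutex);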
Instead of using bd_mutex in the block_device structure, a new blk_trace_mutex is now added to the request_queue structure to protect access to the blk_trace structure. Suggested-by: Christoph Hellwig Signed-off-by: Waiman Long Acked-by: Steven Rostedt (VMware) Fix typo in patch subject line, and prune a comment detailing how the code used to work. Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) } EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. + */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: @@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { if (value) @@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: -- cgit v1.2.3 From cfb766da54d98ceb145e1bb0bd11c559569dcbfc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: sched/cputime: Expose cputime_adjust() Will be used by basic cgroup resource stat reporting later. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- kernel/sched/cputime.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..8839b6e8a104 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -585,9 +585,8 @@ drop_precision: * * Assuming that rtime_i+1 >= rtime_i. 
*/ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - u64 *ut, u64 *st) +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) { u64 rtime, stime, utime; unsigned long flags; -- cgit v1.2.3 From d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: cpuacct: Introduce cgroup_account_cputime[_field]() Introduce cgroup_account_cputime[_field]() which wrap cpuacct_charge() and cgroup_account_field(). This doesn't introduce any functional changes and will be used to add cgroup basic resource accounting. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar --- kernel/sched/cpuacct.h | 17 ----------------- kernel/sched/cputime.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/stop_task.c | 2 +- 7 files changed, 6 insertions(+), 23 deletions(-) delete mode 100644 kernel/sched/cpuacct.h (limited to 'kernel') diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h deleted file mode 100644 index ba72807c73d4..000000000000 --- a/kernel/sched/cpuacct.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifdef CONFIG_CGROUP_CPUACCT - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); - -#else - -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ -} - -static inline void -cpuacct_account_field(struct task_struct *tsk, int index, u64 val) -{ -} - -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8839b6e8a104..e01b699bbd5b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __this_cpu_add(kernel_cpustat.cpustat[index], tmp); - cpuacct_account_field(p, index, tmp); + cgroup_account_cputime_field(p, index, tmp); } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..abd913c1b99e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1143,7 +1143,7 @@ static void update_curr_dl(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..0ae69af95b8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -851,7 +851,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); + cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..fdc2c5d1f82e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -979,7 +979,7 @@ static void update_curr_rt(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..f0b98f978843 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_PARAVIRT #include @@ 
-36,7 +37,6 @@ #include "cpupri.h" #include "cpudeadline.h" -#include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 9f69fb630853..ec0bb5ab9024 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -71,7 +71,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) -- cgit v1.2.3 From 041cd640b2f3c5607171c59d8712b503659d21f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:05 -0700 Subject: cgroup: Implement cgroup2 basic CPU usage accounting In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to combination of two factors - 1. enabling cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. cpuacct controller is effectively used to designate a hierarchy to track CPU usages on. cgroup2's unified hierarchy removes the second reason and we can account basic CPU usages by default. While we can use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event which is unnecessary. This patch adds basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it. * All accountings are done per-cpu and don't propagate immediately. It just bumps the per-cgroup per-cpu counters and links to the parent's updated list if not already on it. * On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated. * CPU usage stats are collected and shown in "cgroup.stat" with "cpu." prefix. Total usage is collected from scheduling events. User/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust(). This keeps the accounting side hot path O(1) and per-cpu and the read side O(nr_updated_since_last_read). v2: Minor changes and documentation updates as suggested by Waiman and Roman. 
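For illustration, reading a cgroup's cgroup.stat on the unified hierarchy then yields entries along these lines (values made up; the cpu.* key names assume the usage/user/system split described above, emitted with the "cpu." prefix passed to cgroup_stat_show_cputime() below):

    nr_descendants 2
    nr_dying_descendants 0
    cpu.usage_usec 843210
    cpu.user_usec 562140
    cpu.system_usec 281070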
Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner Cc: Waiman Long Cc: Roman Gushchin --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 8 + kernel/cgroup/cgroup.c | 24 ++- kernel/cgroup/stat.c | 334 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 365 insertions(+), 3 deletions(-) create mode 100644 kernel/cgroup/stat.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..0acee616e06c 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o +obj-y := cgroup.o stat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5151ff256c29..fa642c99586a 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); +/* + * stat.c + */ +void cgroup_stat_flush(struct cgroup *cgrp); +int cgroup_stat_init(struct cgroup *cgrp); +void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_boot(void); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..d036625556c9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS +static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); + cgroup_stat_show_cputime(seq, "cpu."); + return 0; } @@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_exit(cgrp); kfree(cgrp); } else { /* @@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ trace_cgroup_release(cgrp); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_flush(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; @@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_free_cgrp; + if (cgroup_on_dfl(parent)) { + ret = cgroup_stat_init(cgrp); + if (ret) + goto out_cancel_ref; + } + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. 
@@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_cancel_ref; + goto out_stat_exit; } init_cgroup_housekeeping(cgrp); @@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_stat_exit: + if (cgroup_on_dfl(parent)) + cgroup_stat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5148,6 +5166,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_stat_boot(); + /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c new file mode 100644 index 000000000000..9cce79e89320 --- /dev/null +++ b/kernel/cgroup/stat.c @@ -0,0 +1,334 @@ +#include "cgroup-internal.h" + +#include + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. + */ +static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (cstat->updated_next) + break; + + cstat->updated_next = pcstat->updated_children; + pcstat->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + +/** + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_cpu_stat_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. 
+ */ +static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_cpu_stat *cstat; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + cstat = cgroup_cpu_stat(pos, cpu); + if (cstat->updated_children == pos) + break; + pos = cstat->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && cstat->updated_next) { + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_cpu_stat *ncstat; + struct cgroup **nextp; + + nextp = &pcstat->updated_children; + while (true) { + ncstat = cgroup_cpu_stat(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &ncstat->updated_next; + } + + *nextp = cstat->updated_next; + cstat->updated_next = NULL; + } + + return pos; +} + +static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, + struct cgroup_stat *src_stat) +{ + dst_stat->cputime.utime += src_stat->cputime.utime; + dst_stat->cputime.stime += src_stat->cputime.stime; + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; +} + +static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct task_cputime *last_cputime = &cstat->last_cputime; + struct task_cputime cputime; + struct cgroup_stat delta; + unsigned seq; + + lockdep_assert_held(&cgroup_stat_mutex); + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&cstat->sync); + cputime = cstat->cputime; + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + + /* accumulate the deltas to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); + memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_stat_accumulate(&cgrp->stat, &delta); + if (parent) + cgroup_stat_accumulate(&parent->pending_stat, &delta); +} + +/* see cgroup_stat_flush() */ +static void cgroup_stat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_stat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + cgroup_cpu_stat_flush_one(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_stat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. 
+ * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "%susage_usec %llu\n" + "%suser_usec %llu\n" + "%ssystem_usec %llu\n", + prefix, usage, prefix, utime, prefix, stime); +} + +int cgroup_stat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) + cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +} -- cgit v1.2.3 From 115ef3b7e61ac64e32827611a127002672ed3725 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 25 Sep 2017 20:21:54 +0200 Subject: watchdog/hardlockup/perf: Cure UP damage for_each_cpu() unintuitively reports CPU0 as set independend of the actual cpumask content on UP kernels. That leads to a NULL pointer dereference when the cleanup function is invoked and there is no event to clean up. 
Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner --- kernel/watchdog_hld.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index b2931154b5f2..204a8cadb717 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -220,8 +220,13 @@ void hardlockup_detector_perf_cleanup(void) for_each_cpu(cpu, &dead_events_mask) { struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* + * Required because for_each_cpu() reports unconditionally + * CPU0 as set on UP kernels. Sigh. + */ + if (event) + perf_event_release_kernel(event); per_cpu(watchdog_ev, cpu) = NULL; - perf_event_release_kernel(event); } cpumask_clear(&dead_events_mask); } -- cgit v1.2.3 From e0b477941d130576d9daf31160dab6e34335b10a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:04 +0200 Subject: genirq/debugfs: Show debug information for all irq descriptors Currently the debugfs shows only information about actively used interrupts like /proc/irq/ does. That's fine for most cases, but not helpful when internals of allocated, but unused interrupt descriptors have to debugged. It's also useful to provide information about all descriptors so leaks can be debugged in a simpler way. Move the debugfs registration to the descriptor allocation code. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.355525908@linutronix.de --- kernel/irq/irqdesc.c | 1 + kernel/irq/manage.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 82afb7ed369f..0f6805b75fc8 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -462,6 +462,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); + irq_add_debugfs_entry(start + i, desc); } bitmap_set(allocated_irqs, start, cnt); return start; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d00132b5c325..0e8b48315f3c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1400,7 +1400,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); - irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; -- cgit v1.2.3 From 07557ccb8c83f315e409ddee181e9ffbf87c6ad1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:05 +0200 Subject: genirq/msi: Capture device name for debugfs For debugging the allocation of unused or potentially leaked interrupt descriptor it's helpful to have some information about the site which allocated them. In case of MSI this is simple because the caller hands the device struct pointer into the domain allocation function. Duplicate the device name and show it in the debugfs entry of the interrupt descriptor. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.433038426@linutronix.de --- kernel/irq/debugfs.c | 10 ++++++++++ kernel/irq/internals.h | 5 +++++ kernel/irq/msi.c | 6 +++++- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c3fdb36dec30..b7d1023b9b22 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -149,6 +149,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, ARRAY_SIZE(irqdesc_states)); @@ -226,6 +227,15 @@ static const struct file_operations dfs_irq_ops = { .release = single_release, }; +void irq_debugfs_copy_devname(int irq, struct device *dev) +{ + struct irq_desc *desc = irq_to_desc(irq); + const char *name = dev_name(dev); + + if (name) + desc->dev_name = kstrdup(name, GFP_KERNEL); +} + void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { char name [10]; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a4aa39009f0d..cfaec2669093 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -443,7 +443,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); static inline void irq_remove_debugfs_entry(struct irq_desc *desc) { debugfs_remove(desc->debugfs_file); + kfree(desc->dev_name); } +void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else @@ -458,4 +460,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } +static inline void irq_debugfs_copy_devname(int irq, struct device *dev) +{ +} #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3fa4bd59f569..94ecc0293844 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -16,6 +16,8 @@ #include #include +#include "internals.h" + /** * alloc_msi_entry - Allocate an initialize msi_entry * @dev: Pointer to the device for which this is allocated @@ -373,8 +375,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } - for (i = 0; i < desc->nvec_used; i++) + for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); + irq_debugfs_copy_devname(virq + i, dev); + } } if (ops->msi_finish) -- cgit v1.2.3 From c3e7239a7f43ce1ff407df5f5944bf0d42dc21bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:06 +0200 Subject: irqdomain/debugfs: Provide domain specific debug callback Some interrupt domains like the X86 vector domain has special requirements for debugging, like showing the vector usage on the CPUs. Add a callback to the irqdomain ops which can be filled in by domains which require it and add conditional invocations to the irqdomain and the per irq debug files. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.512937505@linutronix.de --- kernel/irq/debugfs.c | 2 ++ kernel/irq/irqdomain.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index b7d1023b9b22..7f608ac39653 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind) data->domain ? data->domain->name : ""); seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq); irq_debug_show_chip(m, data, ind + 1); + if (data->domain && data->domain->ops && data->domain->ops->debug_show) + data->domain->ops->debug_show(m, NULL, data, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!data->parent_data) return; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..1a75a00ee01f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1810,6 +1810,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) d->revmap_size + d->revmap_direct_max_irq); seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); + if (d->ops && d->ops->debug_show) + d->ops->debug_show(m, d, NULL, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!d->parent) return; -- cgit v1.2.3 From 457f6d35072f395508255ef09fd08f824382cf85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:07 +0200 Subject: genirq: Make state consistent for !IRQ_DOMAIN_HIERARCHY In the !IRQ_DOMAIN_HIERARCHY cas the activation stubs are not setting/clearing the activation status bits. This is not a problem at the moment, but upcoming changes require a correct status. Add the set/clear incovations to the stub functions and move them to the core internal header to avoid duplication and visibility outside the core. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.591985591@linutronix.de --- kernel/irq/internals.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index cfaec2669093..72b8da2a7c39 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -436,6 +436,17 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ +#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) +static inline void irq_domain_activate_irq(struct irq_data *data) +{ + irqd_set_activated(data); +} +static inline void irq_domain_deactivate_irq(struct irq_data *data) +{ + irqd_clr_activated(data); +} +#endif + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include -- cgit v1.2.3 From 239306fee8a59beb728faf8f42b13b2250b24841 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:08 +0200 Subject: genirq: Set managed shut down flag at init Managed interrupts should start up in managed shutdown mode. 
Set the status flag when initialising the irq descriptor. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.669687742@linutronix.de --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0f6805b75fc8..982a3576fb01 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -448,7 +448,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, } } - flags = affinity ? IRQD_AFFINITY_MANAGED : 0; + flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0; mask = NULL; for (i = 0; i < cnt; i++) { -- cgit v1.2.3 From c942cee46bba761ce97ee6d4fc71892e064e8628 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:09 +0200 Subject: genirq: Separate activation and startup Activation of an interrupt and startup are currently a combo functionlity. That works so far, but upcoming changes require a strict separation because the activation can fail in future. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.754334077@linutronix.de --- kernel/irq/autoprobe.c | 2 +- kernel/irq/chip.c | 30 ++++++++++++++++++++++++------ kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 17 ++++++++++++++++- 4 files changed, 43 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index d30a0dd5cc02..6608d03efb23 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE); + irq_activate_and_startup(desc, IRQ_NORESEND); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6fc89fd93824..37dd34d922f4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -207,20 +207,19 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt * installment or irq auto probing should not happen on - * managed irqs either. Emit a warning, break the affinity - * and start it up as a normal interrupt. + * managed irqs either. */ if (WARN_ON_ONCE(force)) - return IRQ_STARTUP_NORMAL; + return IRQ_STARTUP_ABORT; /* * The interrupt was requested, but there is no online CPU * in it's affinity mask. Put it into managed shutdown * state and let the cpu hotplug mechanism start it up once * a CPU in the mask becomes available. 
*/ - irqd_set_managed_shutdown(d); return IRQ_STARTUP_ABORT; } + irq_domain_activate_irq(d); return IRQ_STARTUP_MANAGED; } #else @@ -236,7 +235,9 @@ static int __irq_startup(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); int ret = 0; - irq_domain_activate_irq(d); + /* Warn if this interrupt is not activated but try nevertheless */ + WARN_ON_ONCE(!irqd_is_activated(d)); + if (d->chip->irq_startup) { ret = d->chip->irq_startup(d); irq_state_clr_disabled(desc); @@ -269,6 +270,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) irq_set_affinity_locked(d, aff, false); break; case IRQ_STARTUP_ABORT: + irqd_set_managed_shutdown(d); return 0; } } @@ -278,6 +280,22 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) return ret; } +int irq_activate(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + + if (!irqd_affinity_is_managed(d)) + irq_domain_activate_irq(d); + return 0; +} + +void irq_activate_and_startup(struct irq_desc *desc, bool resend) +{ + if (WARN_ON(irq_activate(desc))) + return; + irq_startup(desc, resend, IRQ_START_FORCE); +} + static void __irq_disable(struct irq_desc *desc, bool mask); void irq_shutdown(struct irq_desc *desc) @@ -953,7 +971,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; - irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); + irq_activate_and_startup(desc, IRQ_RESEND); } } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 72b8da2a7c39..8bd131e4c944 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,6 +74,8 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_FORCE true #define IRQ_START_COND false +extern int irq_activate(struct irq_desc *desc); +extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0e8b48315f3c..e667912d0e9c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -519,7 +519,7 @@ void __enable_irq(struct irq_desc *desc) * time. If it was already started up, then irq_startup() * will invoke irq_enable() under the hood. */ - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: @@ -1325,6 +1325,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_unlock; } + /* + * Activate the interrupt. That activation must happen + * independently of IRQ_NOAUTOEN. request_irq() can fail + * and the callers are supposed to handle + * that. enable_irq() of an interrupt requested with + * IRQ_NOAUTOEN is not supposed to fail. The activation + * keeps it in shutdown mode, it merily associates + * resources if necessary and if that's not possible it + * fails. Interrupts which are in managed shutdown mode + * will simply ignore that activation request. 
+ */ + ret = irq_activate(desc); + if (ret) + goto out_unlock; + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); -- cgit v1.2.3 From 72491643469aab736536ae71dcd199b19dabd891 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:10 +0200 Subject: genirq/irqdomain: Update irq_domain_ops.activate() signature The irq_domain_ops.activate() callback has no return value and no way to tell the function that the activation is early. The upcoming changes to support a reservation scheme which allows to assign interrupt vectors on x86 only when the interrupt is actually requested requires: - A return value, so activation can fail at request_irq() time - Information that the activate invocation is early, i.e. before request_irq(). Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.848490816@linutronix.de --- kernel/irq/irqdomain.c | 2 +- kernel/irq/msi.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1a75a00ee01f..1423f8a6c75d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1690,7 +1690,7 @@ static void __irq_domain_activate_irq(struct irq_data *irq_data) if (irq_data->parent_data) __irq_domain_activate_irq(irq_data->parent_data); if (domain->ops->activate) - domain->ops->activate(domain, irq_data); + domain->ops->activate(domain, irq_data, false); } } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 94ecc0293844..6155fff9f647 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -102,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data, return ret; } -static void msi_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int msi_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct msi_msg msg; BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); irq_chip_write_msi_msg(irq_data, &msg); + return 0; } static void msi_domain_deactivate(struct irq_domain *domain, -- cgit v1.2.3 From bb9b428a5c832d7abb494fbabac37c515c01c6c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:11 +0200 Subject: genirq/irqdomain: Allow irq_domain_activate_irq() to fail Allow irq_domain_activate_irq() to fail. This is required to support a reservation and late vector assignment scheme. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213152.933882227@linutronix.de --- kernel/irq/chip.c | 9 +++++++-- kernel/irq/internals.h | 3 ++- kernel/irq/irqdomain.c | 40 +++++++++++++++++++++++++--------------- kernel/irq/msi.c | 19 +++++++++++++++++-- 4 files changed, 51 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 37dd34d922f4..cd5b3eb38082 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -219,7 +219,12 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) */ return IRQ_STARTUP_ABORT; } - irq_domain_activate_irq(d); + /* + * Managed interrupts have reserved resources, so this should not + * happen. + */ + if (WARN_ON(irq_domain_activate_irq(d))) + return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } #else @@ -285,7 +290,7 @@ int irq_activate(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) - irq_domain_activate_irq(d); + return irq_domain_activate_irq(d); return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8bd131e4c944..e84d0e3899f6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -439,9 +439,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline void irq_domain_activate_irq(struct irq_data *data) +static inline int irq_domain_activate_irq(struct irq_data *data) { irqd_set_activated(data); + return 0; } static inline void irq_domain_deactivate_irq(struct irq_data *data) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1423f8a6c75d..47e8ddd9e8cf 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1682,28 +1682,35 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); -static void __irq_domain_activate_irq(struct irq_data *irq_data) +static void __irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irq_data && irq_data->domain) { struct irq_domain *domain = irq_data->domain; + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); if (irq_data->parent_data) - __irq_domain_activate_irq(irq_data->parent_data); - if (domain->ops->activate) - domain->ops->activate(domain, irq_data, false); + __irq_domain_deactivate_irq(irq_data->parent_data); } } -static void __irq_domain_deactivate_irq(struct irq_data *irq_data) +static int __irq_domain_activate_irq(struct irq_data *irqd) { - if (irq_data && irq_data->domain) { - struct irq_domain *domain = irq_data->domain; + int ret = 0; - if (domain->ops->deactivate) - domain->ops->deactivate(domain, irq_data); - if (irq_data->parent_data) - __irq_domain_deactivate_irq(irq_data->parent_data); + if (irqd && irqd->domain) { + struct irq_domain *domain = irqd->domain; + + if (irqd->parent_data) + ret = __irq_domain_activate_irq(irqd->parent_data); + if (!ret && domain->ops->activate) { + ret = domain->ops->activate(domain, irqd, false); + /* Rollback in case of error */ + if (ret && irqd->parent_data) + __irq_domain_deactivate_irq(irqd->parent_data); + } } + return ret; } /** @@ -1714,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. 
*/ -void irq_domain_activate_irq(struct irq_data *irq_data) +int irq_domain_activate_irq(struct irq_data *irq_data) { - if (!irqd_is_activated(irq_data)) { - __irq_domain_activate_irq(irq_data); + int ret = 0; + + if (!irqd_is_activated(irq_data)) + ret = __irq_domain_activate_irq(irq_data); + if (!ret) irqd_set_activated(irq_data); - } + return ret; } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6155fff9f647..5ece369950ec 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -401,11 +401,26 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; irq_data = irq_domain_get_irq_data(domain, desc->irq); - irq_domain_activate_irq(irq_data); + ret = irq_domain_activate_irq(irq_data); + if (ret) + goto cleanup; } } - return 0; + +cleanup: + for_each_msi_entry(desc, dev) { + struct irq_data *irqd; + + if (desc->irq == virq) + break; + + irqd = irq_domain_get_irq_data(domain, desc->irq); + if (irqd_is_activated(irqd)) + irq_domain_deactivate_irq(irqd); + } + msi_domain_free_irqs(domain, dev); + return ret; } /** -- cgit v1.2.3 From 42e1cc2dc5b698181ab1ffb7972bd880230c506e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:12 +0200 Subject: genirq/irqdomain: Propagate early activation Propagate the early activation mode to the irqdomain activate() callbacks. This is required for the upcoming reservation, late vector assignment scheme, so that the early activation call can act accordingly. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.028353660@linutronix.de --- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 11 ++++++----- kernel/irq/msi.c | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cd5b3eb38082..82333835ac66 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -223,7 +223,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) * Managed interrupts have reserved resources, so this should not * happen. 
*/ - if (WARN_ON(irq_domain_activate_irq(d))) + if (WARN_ON(irq_domain_activate_irq(d, false))) return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } @@ -290,7 +290,7 @@ int irq_activate(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) - return irq_domain_activate_irq(d); + return irq_domain_activate_irq(d, false); return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e84d0e3899f6..a0327136e469 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -439,7 +439,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data) +static inline int irq_domain_activate_irq(struct irq_data *data, bool early) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 47e8ddd9e8cf..b50f737574ae 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1694,7 +1694,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) { int ret = 0; @@ -1702,9 +1702,10 @@ static int __irq_domain_activate_irq(struct irq_data *irqd) struct irq_domain *domain = irqd->domain; if (irqd->parent_data) - ret = __irq_domain_activate_irq(irqd->parent_data); + ret = __irq_domain_activate_irq(irqd->parent_data, + early); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, false); + ret = domain->ops->activate(domain, irqd, early); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1721,12 +1722,12 @@ static int __irq_domain_activate_irq(struct irq_data *irqd) * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ -int irq_domain_activate_irq(struct irq_data *irq_data) +int irq_domain_activate_irq(struct irq_data *irq_data, bool early) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data); + ret = __irq_domain_activate_irq(irq_data, early); if (!ret) irqd_set_activated(irq_data); return ret; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5ece369950ec..d7553d015175 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -401,7 +401,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data); + ret = irq_domain_activate_irq(irq_data, true); if (ret) goto cleanup; } -- cgit v1.2.3 From 22d0b12f3560d3b3264ee79faa1c05a5060fb916 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:13 +0200 Subject: genirq/irqdomain: Add force reactivation flag to irq domains Allow irqdomains to tell the core code, that after early activation the interrupt needs to be reactivated at request_irq() time. This allows reservation of vectors at early activation time and actual vector assignment at request_irq() time. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.106242536@linutronix.de --- kernel/irq/msi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index d7553d015175..edb987b2c58d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -404,6 +404,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ret = irq_domain_activate_irq(irq_data, true); if (ret) goto cleanup; + if (info->flags & MSI_FLAG_MUST_REACTIVATE) + irqd_clr_activated(irq_data); } } return 0; -- cgit v1.2.3 From 2f75d9e1c90511bff6d1ce4de94503cc28fec032 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:14 +0200 Subject: genirq: Implement bitmap matrix allocator Implement the infrastructure for a simple bitmap based allocator, which will replace the x86 vector allocator. It's in the core code as other architectures might be able to reuse/extend it. For now it only implements allocations for single CPUs, but it's simple to add multi CPU allocation support if required. The concept is rather simple: Global information: system_vector bitmap global accounting PerCPU information: allocation bitmap managed allocation bitmap local accounting The system vector bitmap is used to exclude vectors system wide from the allocation space. The allocation bitmap is used to keep track of per cpu used vectors. The managed allocation bitmap is used to reserve vectors for managed interrupts. When a regular (non managed) interrupt allocation happens then the following rule applies: tmpmap = system_map | alloc_map | managed_map find_zero_bit(tmpmap) Oring the bitmaps together gives the real available space. The same rule applies for reserving a managed interrupt vector. But contrary to the regular interrupts the reservation only marks the bit in the managed map and therefor excludes it from the regular allocations. The managed map is only cleaned out when the a managed interrupt is completely released and it stays alive accross CPU offline/online operations. For managed interrupt allocations the rule is: tmpmap = managed_map & ~alloc_map find_first_bit(tmpmap) This returns the first bit which is in the managed map, but not yet allocated in the allocation map. The allocation marks it in the allocation map and hands it back to the caller for use. The rest of the code are helper functions to handle the various requirements and the accounting which are necessary to replace the x86 vector allocation code. The result is a single patch as the evolution of this infrastructure cannot be represented in bits and pieces. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. 
Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Chris Metcalf Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.185437174@linutronix.de --- kernel/irq/Kconfig | 3 + kernel/irq/Makefile | 1 + kernel/irq/matrix.c | 428 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 432 insertions(+) create mode 100644 kernel/irq/matrix.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a117adf7084b..ac1a3e29d3b9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -97,6 +97,9 @@ config HANDLE_DOMAIN_IRQ config IRQ_TIMINGS bool +config GENERIC_IRQ_MATRIX_ALLOCATOR + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1970cafe8f2a..329c9193b4bf 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o +obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c new file mode 100644 index 000000000000..7b2b4fbde1e2 --- /dev/null +++ b/kernel/irq/matrix.c @@ -0,0 +1,428 @@ +/* + * Copyright (C) 2017 Thomas Gleixner + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include +#include +#include +#include +#include +#include + +#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long)) + +struct cpumap { + unsigned int available; + unsigned int allocated; + unsigned int managed; + bool online; + unsigned long alloc_map[IRQ_MATRIX_SIZE]; + unsigned long managed_map[IRQ_MATRIX_SIZE]; +}; + +struct irq_matrix { + unsigned int matrix_bits; + unsigned int alloc_start; + unsigned int alloc_end; + unsigned int alloc_size; + unsigned int global_available; + unsigned int global_reserved; + unsigned int systembits_inalloc; + unsigned int total_allocated; + unsigned int online_maps; + struct cpumap __percpu *maps; + unsigned long scratch_map[IRQ_MATRIX_SIZE]; + unsigned long system_map[IRQ_MATRIX_SIZE]; +}; + +/** + * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it + * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS + * @alloc_start: From which bit the allocation search starts + * @alloc_end: At which bit the allocation search ends, i.e first + * invalid bit + */ +__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, + unsigned int alloc_start, + unsigned int alloc_end) +{ + struct irq_matrix *m; + + if (matrix_bits > IRQ_MATRIX_BITS) + return NULL; + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + return NULL; + + m->matrix_bits = matrix_bits; + m->alloc_start = alloc_start; + m->alloc_end = alloc_end; + m->alloc_size = alloc_end - alloc_start; + m->maps = alloc_percpu(*m->maps); + if (!m->maps) { + kfree(m); + return NULL; + } + return m; +} + +/** + * irq_matrix_online - Bring the local CPU matrix online + * @m: Matrix pointer + */ +void irq_matrix_online(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + BUG_ON(cm->online); + + bitmap_zero(cm->alloc_map, m->matrix_bits); + cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); + cm->allocated = 0; + m->global_available += cm->available; + cm->online = true; + m->online_maps++; +} + +/** + * irq_matrix_offline - Bring the local CPU matrix offline + * @m: Matrix pointer + */ +void irq_matrix_offline(struct irq_matrix *m) +{ + struct 
cpumap *cm = this_cpu_ptr(m->maps); + + /* Update the global available size */ + m->global_available -= cm->available; + cm->online = false; + m->online_maps--; +} + +static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, + unsigned int num, bool managed) +{ + unsigned int area, start = m->alloc_start; + unsigned int end = m->alloc_end; + + bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end); + bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end); + area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0); + if (area >= end) + return area; + if (managed) + bitmap_set(cm->managed_map, area, num); + else + bitmap_set(cm->alloc_map, area, num); + return area; +} + +/** + * irq_matrix_assign_system - Assign system wide entry in the matrix + * @m: Matrix pointer + * @bit: Which bit to reserve + * @replace: Replace an already allocated vector with a system + * vector at the same bit position. + * + * The BUG_ON()s below are on purpose. If this goes wrong in the + * early boot process, then the chance to survive is about zero. + * If this happens when the system is life, it's not much better. + */ +void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, + bool replace) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + BUG_ON(bit > m->matrix_bits); + BUG_ON(m->online_maps > 1 || (m->online_maps && !replace)); + + set_bit(bit, m->system_map); + if (replace) { + BUG_ON(!test_and_clear_bit(bit, cm->alloc_map)); + cm->allocated--; + m->total_allocated--; + } + if (bit >= m->alloc_start && bit < m->alloc_end) + m->systembits_inalloc++; +} + +/** + * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map + * @m: Matrix pointer + * @msk: On which CPUs the bits should be reserved. + * + * Can be called for offline CPUs. Note, this will only reserve one bit + * on all CPUs in @msk, but it's not guaranteed that the bits are at the + * same offset on all CPUs + */ +int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk) +{ + unsigned int cpu, failed_cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit; + + bit = matrix_alloc_area(m, cm, 1, true); + if (bit >= m->alloc_end) + goto cleanup; + cm->managed++; + if (cm->online) { + cm->available--; + m->global_available--; + } + } + return 0; +cleanup: + failed_cpu = cpu; + for_each_cpu(cpu, msk) { + if (cpu == failed_cpu) + break; + irq_matrix_remove_managed(m, cpumask_of(cpu)); + } + return -ENOSPC; +} + +/** + * irq_matrix_remove_managed - Remove managed interrupts in a CPU map + * @m: Matrix pointer + * @msk: On which CPUs the bits should be removed + * + * Can be called for offline CPUs + * + * This removes not allocated managed interrupts from the map. It does + * not matter which one because the managed interrupts free their + * allocation when they shut down. If not, the accounting is screwed, + * but all what can be done at this point is warn about it. 
+ */ +void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) +{ + unsigned int cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit, end = m->alloc_end; + + if (WARN_ON_ONCE(!cm->managed)) + continue; + + /* Get managed bit which are not allocated */ + bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); + + bit = find_first_bit(m->scratch_map, end); + if (WARN_ON_ONCE(bit >= end)) + continue; + + clear_bit(bit, cm->managed_map); + + cm->managed--; + if (cm->online) { + cm->available++; + m->global_available++; + } + } +} + +/** + * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map + * @m: Matrix pointer + * @cpu: On which CPU the interrupt should be allocated + */ +int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu) +{ + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit, end = m->alloc_end; + + /* Get managed bit which are not allocated */ + bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); + bit = find_first_bit(m->scratch_map, end); + if (bit >= end) + return -ENOSPC; + set_bit(bit, cm->alloc_map); + cm->allocated++; + m->total_allocated++; + return bit; +} + +/** + * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map + * @m: Matrix pointer + * @bit: Which bit to mark + * + * This should only be used to mark preallocated vectors + */ +void irq_matrix_assign(struct irq_matrix *m, unsigned int bit) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) + return; + if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map))) + return; + cm->allocated++; + m->total_allocated++; + cm->available--; + m->global_available--; +} + +/** + * irq_matrix_reserve - Reserve interrupts + * @m: Matrix pointer + * + * This is merily a book keeping call. It increments the number of globally + * reserved interrupt bits w/o actually allocating them. This allows to + * setup interrupt descriptors w/o assigning low level resources to it. + * The actual allocation happens when the interrupt gets activated. + */ +void irq_matrix_reserve(struct irq_matrix *m) +{ + if (m->global_reserved <= m->global_available && + m->global_reserved + 1 > m->global_available) + pr_warn("Interrupt reservation exceeds available resources\n"); + + m->global_reserved++; +} + +/** + * irq_matrix_remove_reserved - Remove interrupt reservation + * @m: Matrix pointer + * + * This is merily a book keeping call. It decrements the number of globally + * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the + * interrupt was never in use and a real vector allocated, which undid the + * reservation. 
+ */ +void irq_matrix_remove_reserved(struct irq_matrix *m) +{ + m->global_reserved--; +} + +/** + * irq_matrix_alloc - Allocate a regular interrupt in a CPU map + * @m: Matrix pointer + * @msk: Which CPUs to search in + * @reserved: Allocate previously reserved interrupts + * @mapped_cpu: Pointer to store the CPU for which the irq was allocated + */ +int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, + bool reserved, unsigned int *mapped_cpu) +{ + unsigned int cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit; + + if (!cm->online) + continue; + + bit = matrix_alloc_area(m, cm, 1, false); + if (bit < m->alloc_end) { + cm->allocated++; + cm->available--; + m->total_allocated++; + m->global_available--; + if (reserved) + m->global_reserved--; + *mapped_cpu = cpu; + return bit; + } + } + return -ENOSPC; +} + +/** + * irq_matrix_free - Free allocated interrupt in the matrix + * @m: Matrix pointer + * @cpu: Which CPU map needs be updated + * @bit: The bit to remove + * @managed: If true, the interrupt is managed and not accounted + * as available. + */ +void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, + unsigned int bit, bool managed) +{ + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + + if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) + return; + + if (cm->online) { + clear_bit(bit, cm->alloc_map); + cm->allocated--; + m->total_allocated--; + if (!managed) { + cm->available++; + m->global_available++; + } + } +} + +/** + * irq_matrix_available - Get the number of globally available irqs + * @m: Pointer to the matrix to query + * @cpudown: If true, the local CPU is about to go down, adjust + * the number of available irqs accordingly + */ +unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + return m->global_available - cpudown ? cm->available : 0; +} + +/** + * irq_matrix_reserved - Get the number of globally reserved irqs + * @m: Pointer to the matrix to query + */ +unsigned int irq_matrix_reserved(struct irq_matrix *m) +{ + return m->global_reserved; +} + +/** + * irq_matrix_allocated - Get the number of allocated irqs on the local cpu + * @m: Pointer to the matrix to search + * + * This returns number of allocated irqs + */ +unsigned int irq_matrix_allocated(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + return cm->allocated; +} + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +/** + * irq_matrix_debug_show - Show detailed allocation information + * @sf: Pointer to the seq_file to print to + * @m: Pointer to the matrix allocator + * @ind: Indentation for the print format + * + * Note, this is a lockless snapshot. 
+ */ +void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind) +{ + unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits); + int cpu; + + seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps); + seq_printf(sf, "Global available: %6u\n", m->global_available); + seq_printf(sf, "Global reserved: %6u\n", m->global_reserved); + seq_printf(sf, "Total allocated: %6u\n", m->total_allocated); + seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits, + m->system_map); + seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " "); + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + + seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ", + cpu, cm->available, cm->managed, cm->allocated, + m->matrix_bits, cm->alloc_map); + } + cpus_read_unlock(); +} +#endif -- cgit v1.2.3 From ec0f7cd273dc41ab28bba703cac82690ea5f2863 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:15 +0200 Subject: genirq/matrix: Add tracepoints Add tracepoints for the irq bitmap matrix allocator. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Yu Chen Acked-by: Juergen Gross Cc: Boris Ostrovsky Cc: Tony Luck Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: https://lkml.kernel.org/r/20170913213153.279468022@linutronix.de --- kernel/irq/matrix.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 7b2b4fbde1e2..a3cbbc8191c5 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -36,6 +36,9 @@ struct irq_matrix { unsigned long system_map[IRQ_MATRIX_SIZE]; }; +#define CREATE_TRACE_POINTS +#include + /** * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS @@ -84,6 +87,7 @@ void irq_matrix_online(struct irq_matrix *m) m->global_available += cm->available; cm->online = true; m->online_maps++; + trace_irq_matrix_online(m); } /** @@ -98,6 +102,7 @@ void irq_matrix_offline(struct irq_matrix *m) m->global_available -= cm->available; cm->online = false; m->online_maps--; + trace_irq_matrix_offline(m); } static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, @@ -145,6 +150,8 @@ void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, } if (bit >= m->alloc_start && bit < m->alloc_end) m->systembits_inalloc++; + + trace_irq_matrix_assign_system(bit, m); } /** @@ -172,6 +179,7 @@ int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk) cm->available--; m->global_available--; } + trace_irq_matrix_reserve_managed(bit, cpu, m, cm); } return 0; cleanup: @@ -221,6 +229,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) cm->available++; m->global_available++; } + trace_irq_matrix_remove_managed(bit, cpu, m, cm); } } @@ -242,6 +251,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu) set_bit(bit, cm->alloc_map); cm->allocated++; m->total_allocated++; + trace_irq_matrix_alloc_managed(bit, cpu, m, cm); return bit; } @@ -264,6 +274,7 @@ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit) m->total_allocated++; cm->available--; m->global_available--; + trace_irq_matrix_assign(bit, smp_processor_id(), m, cm); } 
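Since the kernel-doc above is bare API documentation, a compressed usage sketch may help readers unfamiliar with the allocator. This is purely illustrative: it assumes the prototypes exported via <linux/irq.h>, and every demo_* name and constant below is invented rather than taken from the x86 vector code that actually consumes this allocator.

/* Illustrative sketch only: typical life cycle of the matrix allocator. */
#include <linux/irq.h>
#include <linux/cpumask.h>

#define DEMO_NR_VECTORS		256	/* assumed matrix width */
#define DEMO_ALLOC_START	32	/* assumed first allocatable bit */

static struct irq_matrix *demo_matrix;

static int __init demo_vector_init(void)
{
	unsigned int cpu;
	int vec;

	demo_matrix = irq_alloc_matrix(DEMO_NR_VECTORS, DEMO_ALLOC_START,
				       DEMO_NR_VECTORS);
	if (!demo_matrix)
		return -ENOMEM;

	/* Reserve bits outside the allocatable range, e.g. exception vectors. */
	irq_matrix_assign_system(demo_matrix, 2, false);

	/* Mark the booting CPU's per-CPU map as usable. */
	irq_matrix_online(demo_matrix);

	/* Grab one regular vector on any online CPU ... */
	vec = irq_matrix_alloc(demo_matrix, cpu_online_mask, false, &cpu);
	if (vec < 0)
		return vec;

	/* ... program the hardware with (cpu, vec), and later release it. */
	irq_matrix_free(demo_matrix, cpu, vec, false);
	return 0;
}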
/** @@ -282,6 +293,7 @@ void irq_matrix_reserve(struct irq_matrix *m) pr_warn("Interrupt reservation exceeds available resources\n"); m->global_reserved++; + trace_irq_matrix_reserve(m); } /** @@ -296,6 +308,7 @@ void irq_matrix_reserve(struct irq_matrix *m) void irq_matrix_remove_reserved(struct irq_matrix *m) { m->global_reserved--; + trace_irq_matrix_remove_reserved(m); } /** @@ -326,6 +339,7 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, if (reserved) m->global_reserved--; *mapped_cpu = cpu; + trace_irq_matrix_alloc(bit, cpu, m, cm); return bit; } } @@ -357,6 +371,7 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, m->global_available++; } } + trace_irq_matrix_free(bit, cpu, m, cm); } /** -- cgit v1.2.3 From 5df32107f609c1f621bcdac0a685c23677ef671e Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 28 Aug 2017 08:21:53 -0400 Subject: timekeeping: Make fast accessors return 0 before timekeeping is initialized printk timestamps will be extended to include mono and boot time by using the fast timekeeping accessors ktime_get_mono|boot_fast_ns(). The functions can return garbage before timekeeping is initialized resulting in garbage timestamps. Initialize the fast timekeepers with dummy clocks which guarantee a 0 readout up to timekeeping_init(). Suggested-by: Peter Zijlstra Signed-off-by: Prarit Bhargava Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: John Stultz Link: http://lkml.kernel.org/r/1503922914-10660-2-git-send-email-prarit@redhat.com --- kernel/time/timekeeping.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..6a92794427c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -60,8 +60,27 @@ struct tk_fast { struct tk_read_base base[2]; }; -static struct tk_fast tk_fast_mono ____cacheline_aligned; -static struct tk_fast tk_fast_raw ____cacheline_aligned; +/* Suspend-time cycles value for halted fast timekeeper. */ +static u64 cycles_at_suspend; + +static u64 dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; + +static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -477,18 +496,6 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); -/* Suspend-time cycles value for halted fast timekeeper. */ -static u64 cycles_at_suspend; - -static u64 dummy_clock_read(struct clocksource *cs) -{ - return cycles_at_suspend; -} - -static struct clocksource dummy_clock = { - .read = dummy_clock_read, -}; - /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. -- cgit v1.2.3 From 4c3711d7fb4763c63b2654f2d07cbe21ca5aadd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Aug 2017 17:12:48 +0200 Subject: timekeeping: Provide NMI safe access to clock realtime The configurable printk timestamping wants access to clock realtime. 
Right now there is no ktime_get_real_fast_ns() accessor because reading the monotonic base and the realtime offset cannot be done atomically. Contrary to boot time this offset can change during runtime and cause half updated readouts. struct tk_read_base was fully packed when the fast timekeeper access was implemented. commit ceea5e3771ed ("time: Fix clock->read(clock) race around clocksource changes") removed the 'read' function pointer from the structure, but of course left the comment stale. So now the structure can fit a new 64bit member w/o violating the cache line constraints. Add real_base to tk_read_base and update it in the fast timekeeper update sequence. Implement an accessor which follows the same scheme as the accessor to clock monotonic, but uses the new real_base to access clock real time. The runtime overhead for updating real_base is minimal as it just adds two cache hot values and stores them into an already dirtied cache line along with the other fast timekeeper updates. Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: John Stultz Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/1505757060-2004-3-git-send-email-prarit@redhat.com --- kernel/time/timekeeping.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a92794427c9..8af77006e937 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -496,6 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + +/* + * See comment for __ktime_get_fast_ns() vs. timestamp ordering + */ +static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base_real); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + */ +u64 ktime_get_real_fast_ns(void) +{ + return __ktime_get_real_fast_ns(&tk_fast_mono); +} + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. @@ -514,6 +547,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; + tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; @@ -661,6 +695,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); -- cgit v1.2.3 From 7755d83e48397e822aac751b1545f8bcf71d133e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Sep 2017 21:20:41 +0900 Subject: irqdomain: Add __rcu annotations to radix tree accessors Fix various address spaces warning of sparse. 
kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces) kernel/irq/irqdomain.c:1463:14: expected void **slot kernel/irq/irqdomain.c:1463:14: got void [noderef] ** kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces) kernel/irq/irqdomain.c:1465:66: expected void [noderef] **slot kernel/irq/irqdomain.c:1465:66: got void **slot Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/1506082841-11530-1-git-send-email-yamada.masahiro@socionext.com --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. */ -- cgit v1.2.3 From 96abb968549cdefd0964d1f7af0a79f4e6e7f897 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:16 +0200 Subject: smp/hotplug: Allow external multi-instance rollback Currently the rollback of multi-instance states is handled inside cpuhp_invoke_callback(). The problem is that when we want to allow an explicit state change for rollback, we need to return from the function without doing the rollback. Change cpuhp_invoke_callback() to optionally return the multi-instance state, such that rollback can be done from a subsequent call. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.720361181@infradead.org --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..323b71050b54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -123,13 +123,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked - * @step: The step in the state machine + * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked + * @node: For multi-instance, do a single entry callback for install/remove + * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup, struct hlist_node *node) + bool bringup, struct hlist_node *node, + struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); @@ -138,6 +141,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int ret, cnt; if (!step->multi_instance) { + WARN_ON_ONCE(lastp && *lastp); cb = bringup ? 
step->startup.single : step->teardown.single; if (!cb) return 0; @@ -152,6 +156,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* Single invocation for instance add/remove */ if (node) { + WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); @@ -161,13 +166,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* State transition. Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { + if (lastp && node == *lastp) + break; + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); - if (ret) - goto err; + if (ret) { + if (!lastp) + goto err; + + *lastp = node; + return ret; + } cnt++; } + if (lastp) + *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ @@ -323,7 +338,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -334,7 +349,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, int ret = 0; for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_down(cpu, st); @@ -350,7 +365,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } } @@ -362,7 +377,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -428,11 +443,13 @@ static void cpuhp_thread_fun(unsigned int cpu) if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); local_irq_enable(); } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -472,7 +489,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup, node); + return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->cb_state = state; st->single = true; @@ -595,7 +612,7 @@ static int take_cpu_down(void *_param) st->state--; /* Invoke the former CPU_DYING callbacks */ for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -776,7 +793,7 @@ void notify_cpu_starting(unsigned int cpu) rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. 
*/ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -1307,9 +1324,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; -- cgit v1.2.3 From 4dddfb5faa6118564b0c54a163353d13882299d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:17 +0200 Subject: smp/hotplug: Rewrite AP state machine core There is currently no explicit state change on rollback. That is, st->bringup, st->rollback and st->target are not consistent when doing the rollback. Rework the AP state handling to be more coherent. This does mean we have to do a second AP kick-and-wait for rollback, but since rollback is the slow path of a slowpath, this really should not matter. Take this opportunity to simplify the AP thread function to only run a single callback per invocation. This unifies the three single/up/down modes is supports. The looping it used to do for up/down are achieved by retaining should_run and relying on the main smpboot_thread_fn() loop. (I have most of a patch that does the same for the BP state handling, but that's not critical and gets a little complicated because CPUHP_BRINGUP_CPU does the AP handoff from a callback, which gets recursive @st usage, I still have de-fugly that.) [ tglx: Move cpuhp_down_callbacks() et al. into the HOTPLUG_CPU section to avoid gcc complaining about unused functions. Make the HOTPLUG_CPU one piece instead of having two consecutive ifdef sections of the same type. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.769658088@infradead.org --- kernel/cpu.c | 321 ++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 323b71050b54..1139063de5af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,6 +58,7 @@ struct cpuhp_cpu_state { bool single; bool bringup; struct hlist_node *node; + struct hlist_node *last; enum cpuhp_state cb_state; int result; struct completion done; @@ -112,6 +113,14 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. 
+ */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -286,7 +295,72 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); +static inline enum cpuhp_state +cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + + st->rollback = false; + st->last = NULL; + + st->target = target; + st->single = false; + st->bringup = st->state < target; + + return prev_state; +} + +static inline void +cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +{ + st->rollback = true; + + /* + * If we have st->last we need to undo partial multi_instance of this + * state first. Otherwise start undo at the previous state. + */ + if (!st->last) { + if (st->bringup) + st->state--; + else + st->state++; + } + + st->target = prev_state; + st->bringup = !st->bringup; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) +{ + if (!st->single && st->state == st->target) + return; + + st->result = 0; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); +} + +static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state; + int ret; + + prev_state = cpuhp_set_state(st, target); + __cpuhp_kick_ap(st); + if ((ret = st->result)) { + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); + } + + return ret; +} static int bringup_wait_for_ap(unsigned int cpu) { @@ -301,12 +375,10 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) { - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - } - return st->result; + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -332,32 +404,6 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); - } -} - -static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - enum cpuhp_state target) -{ - enum cpuhp_state prev_state = st->state; - int ret = 0; - - for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } - return ret; -} static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { @@ -404,71 +450,90 @@ static int cpuhp_should_run(unsigned int cpu) return st->should_run; } -/* Execute the teardown callbacks. 
Used to be CPU_DOWN_PREPARE */ -static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - - return cpuhp_down_callbacks(cpu, st, target); -} - -/* Execute the online startup callbacks. Used to be CPU_ONLINE */ -static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - return cpuhp_up_callbacks(cpu, st, st->target); -} - /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. + * + * Each invocation of this function by the smpboot thread does a single AP + * state callback. + * + * It has 3 modes of operation: + * - single: runs st->cb_state + * - up: runs ++st->state, while st->state < st->target + * - down: runs st->state--, while st->state > st->target + * + * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - int ret = 0; + bool bringup = st->bringup; + enum cpuhp_state state; /* - * Paired with the mb() in cpuhp_kick_ap_work and - * cpuhp_invoke_ap_callback, so the work set is consistent visible. + * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures + * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (!st->should_run) - return; - st->should_run = false; + if (WARN_ON_ONCE(!st->should_run)) + return; lock_map_acquire(&cpuhp_state_lock_map); - /* Single callback invocation for [un]install ? */ + if (st->single) { - if (st->cb_state < CPUHP_AP_ONLINE) { - local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); - local_irq_enable(); + state = st->cb_state; + st->should_run = false; + } else { + if (bringup) { + st->state++; + state = st->state; + st->should_run = (st->state < st->target); + WARN_ON_ONCE(st->state > st->target); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); + state = st->state; + st->state--; + st->should_run = (st->state > st->target); + WARN_ON_ONCE(st->state < st->target); } - } else if (st->rollback) { - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + } + + WARN_ON_ONCE(!cpuhp_is_ap_state(state)); + + if (st->rollback) { + struct cpuhp_step *step = cpuhp_get_step(state); + if (step->skip_onerr) + goto next; + } + + if (cpuhp_is_atomic_state(state)) { + local_irq_disable(); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + local_irq_enable(); - undo_cpu_down(cpu, st); - st->rollback = false; + /* + * STARTING/DYING must not fail! + */ + WARN_ON_ONCE(st->result); } else { - /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - - /* Regular hotplug work */ - if (st->state < st->target) - ret = cpuhp_ap_online(cpu, st); - else if (st->state > st->target) - ret = cpuhp_ap_offline(cpu, st); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + } + + if (st->result) { + /* + * If we fail on a rollback, we're up a creek without no + * paddle, no way forward, no way back. We loose, thanks for + * playing. 
+ */ + WARN_ON_ONCE(st->rollback); + st->should_run = false; } + +next: lock_map_release(&cpuhp_state_lock_map); - st->result = ret; - complete(&st->done); + + if (!st->should_run) + complete(&st->done); } /* Invoke a single callback on a remote cpu */ @@ -477,6 +542,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; if (!cpu_online(cpu)) return 0; @@ -491,48 +557,43 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + st->rollback = false; + st->last = NULL; + + st->node = node; + st->bringup = bringup; st->cb_state = state; st->single = true; - st->bringup = bringup; - st->node = node; - /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() - */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); - wait_for_completion(&st->done); - return st->result; -} + __cpuhp_kick_ap(st); -/* Regular hotplug invocation of the AP hotplug thread */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) -{ - st->result = 0; - st->single = false; /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() + * If we failed and did a partial, do a rollback. */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); + if ((ret = st->result) && st->last) { + st->rollback = true; + st->bringup = !bringup; + + __cpuhp_kick_ap(st); + } + + return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; + enum cpuhp_state prev_state = st->state; + int ret; - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); lock_map_acquire(&cpuhp_state_lock_map); lock_map_release(&cpuhp_state_lock_map); - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - trace_cpuhp_exit(cpu, st->state, state, st->result); - return st->result; + + trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); + ret = cpuhp_kick_ap(st, st->target); + trace_cpuhp_exit(cpu, st->state, prev_state, ret); + + return ret; } static struct smp_hotplug_thread cpuhp_threads = { @@ -693,11 +754,32 @@ void cpuhp_report_idle_dead(void) cpuhp_complete_idle_dead, st, 0); } -#else -#define takedown_cpu NULL -#endif +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_get_step(st->state); -#ifdef CONFIG_HOTPLUG_CPU + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + return ret; +} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -716,13 +798,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; - st->target = target; + prev_state = cpuhp_set_state(st, target); /* * If the current CPU 
state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { + st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -737,6 +819,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; + + st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need @@ -744,9 +828,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { - st->target = prev_state; - st->rollback = true; - cpuhp_kick_ap_work(cpu); + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); } out: @@ -771,11 +854,15 @@ out: cpu_maps_update_done(); return err; } + int cpu_down(unsigned int cpu) { return do_cpu_down(cpu, CPUHP_OFFLINE); } EXPORT_SYMBOL(cpu_down); + +#else +#define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** @@ -846,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - st->target = target; + cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. @@ -1313,6 +1400,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; + /* + * If there's nothing to do, we done. + * Relies on the union for multi_instance. + */ if ((bringup && !sp->startup.single) || (!bringup && !sp->teardown.single)) return 0; -- cgit v1.2.3 From 724a86881d03ee5794148e65142e24ed3621be66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:18 +0200 Subject: smp/hotplug: Callback vs state-machine consistency While the generic callback functions have an 'int' return and thus appear to be allowed to return error, this is not true for all states. Specifically, what used to be STARTING/DYING are ran with IRQs disabled from critical parts of CPU bringup/teardown and are not allowed to fail. Add WARNs to enforce this rule. But since some callbacks are indeed allowed to fail, we have the situation where a state-machine rollback encounters a failure, in this case we're stuck, we can't go forward and we can't go back. Also add a WARN for that case. AFAICT this is a fundamental 'problem' with no real obvious solution. We want the 'prepare' callbacks to allow failure on either up or down. Typically on prepare-up this would be things like -ENOMEM from resource allocations, and the typical usage in prepare-down would be something like -EBUSY to avoid CPUs being taken away. 
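To make the failure semantics above concrete, here is a minimal sketch of a prepare-level hotplug state whose callbacks are allowed to fail. The demo_* names, the busy-check policy and the allocation size are invented for illustration and are not part of this patch.

/* Illustrative sketch only: prepare callbacks may fail, STARTING/DYING may not. */
#include <linux/cpuhotplug.h>
#include <linux/slab.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(void *, demo_buf);

static bool demo_cpu_is_busy(unsigned int cpu)
{
	return false;	/* placeholder policy for this sketch */
}

static int demo_prepare(unsigned int cpu)
{
	/* prepare-up: resource allocation, typically fails with -ENOMEM */
	void *p = kzalloc(1024, GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	per_cpu(demo_buf, cpu) = p;
	return 0;
}

static int demo_dead(unsigned int cpu)
{
	/* prepare-down: may veto the unplug, typically with -EBUSY */
	if (demo_cpu_is_busy(cpu))
		return -EBUSY;

	kfree(per_cpu(demo_buf, cpu));
	per_cpu(demo_buf, cpu) = NULL;
	return 0;
}

static int __init demo_hp_init(void)
{
	int ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "demo:prepare",
				    demo_prepare, demo_dead);

	/* Dynamic states return the allocated state number on success. */
	return ret < 0 ? ret : 0;
}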
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.819539119@infradead.org --- kernel/cpu.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1139063de5af..d6f1b8c36400 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -202,7 +202,14 @@ err: hlist_for_each(node, &step->list) { if (!cnt--) break; - cbm(cpu, node); + + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + /* + * Rollback must not fail, + */ + WARN_ON_ONCE(ret); } return ret; } @@ -659,6 +666,7 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); + int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -672,8 +680,13 @@ static int take_cpu_down(void *_param) WARN_ON(st->state != CPUHP_TEARDOWN_CPU); st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + /* + * DYING must not fail! + */ + WARN_ON_ONCE(ret); + } /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -876,11 +889,16 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + /* + * STARTING must not fail! + */ + WARN_ON_ONCE(ret); } } -- cgit v1.2.3 From 5f4b55e10645b7371322c800a5ec745cab487a6c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:20 +0200 Subject: smp/hotplug: Differentiate the AP-work lockdep class between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. CPU0 CPU1 CPU2 cpuhp_up_callbacks: takedown_cpu: cpuhp_thread_fun: cpuhp_state irq_lock_sparse() irq_lock_sparse() wait_for_completion() cpuhp_state complete() Now that we have consistent AP state, we can trivially separate the AP-work class between up and down using st->bringup. 
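For readers unfamiliar with pseudo-lock annotations, a generic sketch of the direction-split map pattern follows; the demo_* names are invented for illustration and this is not the cpuhp code itself, which is in the diff below.

/* Illustrative sketch only: one lockdep map per direction of a handshake. */
#include <linux/lockdep.h>

static struct lockdep_map demo_up_map =
	STATIC_LOCKDEP_MAP_INIT("demo-up", &demo_up_map);
static struct lockdep_map demo_down_map =
	STATIC_LOCKDEP_MAP_INIT("demo-down", &demo_down_map);

static void demo_kick_and_wait(bool bringup)
{
	struct lockdep_map *map = bringup ? &demo_up_map : &demo_down_map;

	/*
	 * No real lock is taken; acquire/release only records a dependency
	 * so lockdep can connect the waiter with the work done on its
	 * behalf, while keeping the up and down chains in separate classes.
	 */
	lock_map_acquire(map);
	/* ... wake the AP thread and wait for its completion here ... */
	lock_map_release(map);
}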
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.922524234@infradead.org --- kernel/cpu.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d6f1b8c36400..d5c09985fbb6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -68,9 +68,26 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) -static struct lock_class_key cpuhp_state_key; -static struct lockdep_map cpuhp_state_lock_map = - STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +static struct lockdep_map cpuhp_state_up_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); +static struct lockdep_map cpuhp_state_down_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); + + +static void inline cpuhp_lock_acquire(bool bringup) +{ + lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} + +static void inline cpuhp_lock_release(bool bringup) +{ + lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} +#else + +static void inline cpuhp_lock_acquire(bool bringup) { } +static void inline cpuhp_lock_release(bool bringup) { } + #endif /** @@ -486,7 +503,7 @@ static void cpuhp_thread_fun(unsigned int cpu) if (WARN_ON_ONCE(!st->should_run)) return; - lock_map_acquire(&cpuhp_state_lock_map); + cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; @@ -537,7 +554,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } next: - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_release(bringup); if (!st->should_run) complete(&st->done); @@ -554,8 +571,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls @@ -593,8 +613,11 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state prev_state = st->state; int ret; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(st, st->target); -- cgit v1.2.3 From 5ebe7742fff8be5f1359bc50f5d43fb6ff7bd060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:19 +0200 Subject: smp/hotplug: Differentiate the AP completion between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. takedown_cpu() irq_lock_sparse() wait_for_completion(&st->done) cpuhp_thread_fun cpuhp_up_callback cpuhp_invoke_callback irq_affinity_online_cpu irq_local_spare() irq_unlock_sparse() complete(&st->done) Now that we have consistent AP state, we can trivially separate the AP completion between up and down using st->bringup. 
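The completion side follows the same direction-split idea; a tiny illustrative sketch (names invented, the real code is in the diff below):

/* Illustrative sketch only: pick the completion matching the direction. */
#include <linux/completion.h>

struct demo_ap_state {
	struct completion done_up;
	struct completion done_down;
};

static void demo_wait_for_ap(struct demo_ap_state *st, bool bringup)
{
	wait_for_completion(bringup ? &st->done_up : &st->done_down);
}

static void demo_complete_ap(struct demo_ap_state *st, bool bringup)
{
	complete(bringup ? &st->done_up : &st->done_down);
}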
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.872472799@infradead.org --- kernel/cpu.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d5c09985fbb6..6bbe261b851f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -46,7 +46,8 @@ * @bringup: Single callback bringup or teardown selector * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation - * @done: Signal completion to the issuer of the task + * @done_up: Signal completion to the issuer of the task for cpu-up + * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; @@ -61,7 +62,8 @@ struct cpuhp_cpu_state { struct hlist_node *last; enum cpuhp_state cb_state; int result; - struct completion done; + struct completion done_up; + struct completion done_down; #endif }; @@ -130,14 +132,6 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } -/* - * The former STARTING/DYING states, ran with IRQs disabled and must not fail. - */ -static bool cpuhp_is_atomic_state(enum cpuhp_state state) -{ - return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; -} - static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -232,6 +226,26 @@ err: } #ifdef CONFIG_SMP +static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + wait_for_completion(done); +} + +static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + complete(done); +} + +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -368,7 +382,7 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) smp_mb(); st->should_run = true; wake_up_process(st->thread); - wait_for_completion(&st->done); + wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) @@ -391,7 +405,7 @@ static int bringup_wait_for_ap(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; @@ -464,7 +478,8 @@ static void cpuhp_create(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - init_completion(&st->done); + init_completion(&st->done_up); + init_completion(&st->done_down); } static int cpuhp_should_run(unsigned int cpu) @@ -557,7 +572,7 @@ next: cpuhp_lock_release(bringup); if (!st->should_run) - complete(&st->done); + complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ @@ -753,7 +768,7 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. 
*/ - wait_for_completion(&st->done); + wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ @@ -772,7 +787,7 @@ static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; - complete(&st->done); + complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) @@ -939,7 +954,7 @@ void cpuhp_online_idle(enum cpuhp_state state) return; st->state = CPUHP_AP_ONLINE_IDLE; - complete(&st->done); + complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ -- cgit v1.2.3 From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:21 +0200 Subject: smp/hotplug: Hotplug state fail injection Add a sysfs file to one-time fail a specific state. This can be used to test the state rollback code paths. Something like this (hotplug-up.sh): #!/bin/bash echo 0 > /debug/sched_debug echo 1 > /debug/tracing/events/cpuhp/enable ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1` STATES=${1:-$ALL_STATES} for state in $STATES do echo 0 > /sys/devices/system/cpu/cpu1/online echo 0 > /debug/tracing/trace echo Fail state: $state echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail cat /sys/devices/system/cpu/cpu1/hotplug/fail echo 1 > /sys/devices/system/cpu/cpu1/online cat /debug/tracing/trace > hotfail-${state}.trace sleep 1 done Can be used to test for all possible rollback (barring multi-instance) scenarios on CPU-up, CPU-down is a trivial modification of the above. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org --- kernel/cpu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6bbe261b851f..8de11a29e495 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; + enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; @@ -67,7 +68,9 @@ struct cpuhp_cpu_state { #endif }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { + .fail = CPUHP_INVALID, +}; #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = @@ -160,6 +163,15 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (st->fail == state) { + st->fail = CPUHP_INVALID; + + if (!(bringup ? step->startup.single : step->teardown.single)) + return 0; + + return -EAGAIN; + } + if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; @@ -1805,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev, } static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int fail, ret; + + ret = kstrtoint(buf, 10, &fail); + if (ret) + return ret; + + /* + * Cannot fail STARTING/DYING callbacks. 
+ */ + if (cpuhp_is_atomic_state(fail)) + return -EINVAL; + + /* + * Cannot fail anything that doesn't have callbacks. + */ + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(fail); + if (!sp->startup.single && !sp->teardown.single) + ret = -EINVAL; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + st->fail = fail; + + return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); + static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, + &dev_attr_fail.attr, NULL }; -- cgit v1.2.3 From 38683148828165ea0b66ace93a9fedc2d3281e27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 13:50:20 -0700 Subject: cgroup: statically initialize init_css_set->dfl_cgrp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like other csets, init_css_set's dfl_cgrp is initialized when the cset gets linked. init_css_set gets linked in cgroup_init(). This has been fine till now but the recently added basic CPU usage accounting may end up accessing dfl_cgrp of init before cgroup_init() leading to the following oops. SELinux: Initializing. BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0 IP: account_system_index_time+0x60/0x90 PGD 0 P4D 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc2-00003-g041cd64 #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +1.9.3-20161025_171302-gandalf 04/01/2014 task: ffffffff81e10480 task.stack: ffffffff81e00000 RIP: 0010:account_system_index_time+0x60/0x90 RSP: 0000:ffff880011e03cb8 EFLAGS: 00010002 RAX: ffffffff81ef8800 RBX: ffffffff81e10480 RCX: 0000000000000003 RDX: 0000000000000000 RSI: 00000000000f4240 RDI: 0000000000000000 RBP: ffff880011e03cc0 R08: 0000000000010000 R09: 0000000000000000 R10: 0000000000000020 R11: 0000003b9aca0000 R12: 000000000001c100 R13: 0000000000000000 R14: ffffffff81e10480 R15: ffffffff81e03cd8 FS: 0000000000000000(0000) GS:ffff880011e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000b0 CR3: 0000000001e09000 CR4: 00000000000006b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: account_system_time+0x45/0x60 account_process_tick+0x5a/0x140 update_process_times+0x22/0x60 tick_periodic+0x2b/0x90 tick_handle_periodic+0x25/0x70 timer_interrupt+0x15/0x20 __handle_irq_event_percpu+0x7e/0x1b0 handle_irq_event_percpu+0x23/0x60 handle_irq_event+0x42/0x70 handle_level_irq+0x83/0x100 handle_irq+0x6f/0x110 do_IRQ+0x46/0xd0 common_interrupt+0x9d/0x9d Fix it by statically initializing init_css_set.dfl_cgrp so that init's default cgroup is accessible from the get-go. 
Fixes: 041cd640b2f3 ("cgroup: Implement cgroup2 basic CPU usage accounting") Reported-by: “kbuild-all@01.org” Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d036625556c9..7975b20f1fd1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -649,6 +649,14 @@ struct css_set init_css_set = { .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), + + /* + * The following field is re-initialized when this cset gets linked + * in cgroup_init(). However, let's initialize the field + * statically too so that the default cgroup can be accessed safely + * early during boot. + */ + .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ -- cgit v1.2.3 From 8157a7faf94135386bf04b1cf94e6efd3fb94702 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 14:27:54 -0700 Subject: sched/cputime: Add dummy cputime_adjust() implementation for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cfb766da54d9 ("sched/cputime: Expose cputime_adjust()") made cputime_adjust() public for cgroup basic cpu stat support; however, the commit forgot to add a dummy implementaiton for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE leading to compiler errors on some s390 configurations. Fix it by adding the missing dummy implementation. Reported-by: “kbuild-all@01.org” Fixes: cfb766da54d9 ("sched/cputime: Expose cputime_adjust()") Signed-off-by: Tejun Heo --- kernel/sched/cputime.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e01b699bbd5b..5498f20d2475 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -447,6 +447,13 @@ void vtime_account_irq_enter(struct task_struct *tsk) EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) +{ + *ut = curr->utime; + *st = curr->stime; +} + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { *ut = p->utime; -- cgit v1.2.3 From b5d7388f9db78f19e6af7b56a469ca8d1860329d Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 21 Sep 2017 18:43:29 -0400 Subject: bpf: Optimize lpm trie delete Before the delete operator was added, this datastructure maintained an invariant that intermediate nodes were only present when necessary to build the tree. This patch updates the delete operation to reinstate that invariant by removing unnecessary intermediate nodes after a node is removed and thus keeping the tree structure at a minimal size. Suggested-by: Daniel Mack Signed-off-by: Craig Gallek Signed-off-by: David S. 
Miller --- kernel/bpf/lpm_trie.c | 71 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9d58a576b2ae..34d8a690ea05 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -394,8 +394,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key; - struct lpm_trie_node __rcu **trim; - struct lpm_trie_node *node; + struct lpm_trie_node __rcu **trim, **trim2; + struct lpm_trie_node *node, *parent; unsigned long irq_flags; unsigned int next_bit; size_t matchlen = 0; @@ -407,31 +407,26 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) raw_spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping - * track of where we could begin trimming the tree. The trim-point - * is the sub-tree along the walk consisting of only single-child - * intermediate nodes and ending at a leaf node that we want to - * remove. + * track of the path we traverse. We will need to know the node + * we wish to delete, and the slot that points to the node we want + * to delete. We may also need to know the nodes parent and the + * slot that contains it. */ trim = &trie->root; - node = rcu_dereference_protected( - trie->root, lockdep_is_held(&trie->lock)); - while (node) { + trim2 = trim; + parent = NULL; + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || node->prefixlen == key->prefixlen) break; + parent = node; + trim2 = trim; next_bit = extract_bit(key->data, node->prefixlen); - /* If we hit a node that has more than one child or is a valid - * prefix itself, do not remove it. Reset the root of the trim - * path to its descendant on our path. - */ - if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || - (node->child[0] && node->child[1])) - trim = &node->child[next_bit]; - node = rcu_dereference_protected( - node->child[next_bit], lockdep_is_held(&trie->lock)); + trim = &node->child[next_bit]; } if (!node || node->prefixlen != key->prefixlen || @@ -442,27 +437,47 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) trie->n_entries--; - /* If the node we are removing is not a leaf node, simply mark it + /* If the node we are removing has two children, simply mark it * as intermediate and we are done. */ - if (rcu_access_pointer(node->child[0]) || + if (rcu_access_pointer(node->child[0]) && rcu_access_pointer(node->child[1])) { node->flags |= LPM_TREE_NODE_FLAG_IM; goto out; } - /* trim should now point to the slot holding the start of a path from - * zero or more intermediate nodes to our leaf node for deletion. + /* If the parent of the node we are about to delete is an intermediate + * node, and the deleted node doesn't have any children, we can delete + * the intermediate parent as well and promote its other child + * up the tree. Doing this maintains the invariant that all + * intermediate nodes have exactly 2 children and that there are no + * unnecessary intermediate nodes in the tree. */ - while ((node = rcu_dereference_protected( - *trim, lockdep_is_held(&trie->lock)))) { - RCU_INIT_POINTER(*trim, NULL); - trim = rcu_access_pointer(node->child[0]) ? 
- &node->child[0] : - &node->child[1]; + if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) && + !node->child[0] && !node->child[1]) { + if (node == rcu_access_pointer(parent->child[0])) + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[1])); + else + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[0])); + kfree_rcu(parent, rcu); kfree_rcu(node, rcu); + goto out; } + /* The node we are removing has either zero or one child. If there + * is a child, move it into the removed node's slot then delete + * the node. Otherwise just clear the slot and delete the node. + */ + if (node->child[0]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0])); + else if (node->child[1]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); + else + RCU_INIT_POINTER(*trim, NULL); + kfree_rcu(node, rcu); + out: raw_spin_unlock_irqrestore(&trie->lock, irq_flags); -- cgit v1.2.3 From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:04 -0700 Subject: kthread: add a mechanism to store cgroup info kthread usually runs jobs on behalf of other threads. The jobs should be charged to cgroup of original threads. But the jobs run in a kthread, where we lose the cgroup context of original threads. The patch adds a machanism to record cgroup info of original threads in kthread context. Later we can retrieve the cgroup info and attach the cgroup info to jobs. Since this mechanism is only required by kthread, we store the cgroup info in kthread data instead of generic task_struct. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf82427..b011ea08967f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,7 +20,6 @@ #include #include #include -#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -47,6 +46,9 @@ struct kthread { void *data; struct completion parked; struct completion exited; +#ifdef CONFIG_CGROUPS + struct cgroup_subsys_state *blkcg_css; +#endif }; enum KTHREAD_BITS { @@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k) void free_kthread_struct(struct task_struct *k) { + struct kthread *kthread; + /* * Can be NULL if this kthread was created by kernel_thread() * or if kmalloc() in kthread() failed. */ - kfree(to_kthread(k)); + kthread = to_kthread(k); +#ifdef CONFIG_CGROUPS + WARN_ON_ONCE(kthread && kthread->blkcg_css); +#endif + kfree(kthread); } /** @@ -216,6 +224,9 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); +#ifdef CONFIG_CGROUPS + self->blkcg_css = NULL; +#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -1154,3 +1165,54 @@ void kthread_destroy_worker(struct kthread_worker *worker) kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); + +#ifdef CONFIG_CGROUPS +/** + * kthread_associate_blkcg - associate blkcg to current kthread + * @css: the cgroup info + * + * Current thread must be a kthread. The thread is running jobs on behalf of + * other threads. In some cases, we expect the jobs attach cgroup info of + * original threads instead of that of current thread. This function stores + * original thread's cgroup info in current kthread context for later + * retrieval. 
+ */ +void kthread_associate_blkcg(struct cgroup_subsys_state *css) +{ + struct kthread *kthread; + + if (!(current->flags & PF_KTHREAD)) + return; + kthread = to_kthread(current); + if (!kthread) + return; + + if (kthread->blkcg_css) { + css_put(kthread->blkcg_css); + kthread->blkcg_css = NULL; + } + if (css) { + css_get(css); + kthread->blkcg_css = css; + } +} +EXPORT_SYMBOL(kthread_associate_blkcg); + +/** + * kthread_blkcg - get associated blkcg css of current kthread + * + * Current thread must be a kthread. + */ +struct cgroup_subsys_state *kthread_blkcg(void) +{ + struct kthread *kthread; + + if (current->flags & PF_KTHREAD) { + kthread = to_kthread(current); + if (kthread) + return kthread->blkcg_css; + } + return NULL; +} +EXPORT_SYMBOL(kthread_blkcg); +#endif -- cgit v1.2.3 From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2017 11:02:12 -0700 Subject: block: fix a build error The code is only for blkcg not for all cgroups Fixes: d4478e92d618 ("block/loop: make loop cgroup aware") Reported-by: kbuild test robot Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index b011ea08967f..f87cd8b4eb2a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -46,7 +46,7 @@ struct kthread { void *data; struct completion parked; struct completion exited; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif }; @@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k) * or if kmalloc() in kthread() failed. */ kthread = to_kthread(k); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread && kthread->blkcg_css); #endif kfree(kthread); @@ -224,7 +224,7 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP self->blkcg_css = NULL; #endif current->vfork_done = &self->exited; @@ -1166,7 +1166,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info -- cgit v1.2.3 From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..a298d6666698 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) skb_orphan(skb); skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; @@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. 
*/ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); -- cgit v1.2.3 From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 114 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..f849eca36052 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...) 
va_end(args); } +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -187,6 +193,7 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) + if (type_is_pkt_pointer(t)) verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || @@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; @@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) mark_reg_unknown(state->regs, value_regno); @@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; @@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. 
*/ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) + if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max_t(u16, regs[i].range, dst_reg->off); @@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, dst_reg->off); } } @@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are -- cgit v1.2.3 From 66a733ea6b611aecf0119514d2dddab5f9d6c01e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Sep 2017 09:25:30 -0600 Subject: seccomp: fix the usage of get/put_seccomp_filter() in 
seccomp_get_filter() As Chris explains, get_seccomp_filter() and put_seccomp_filter() can end up using different filters. Once we drop ->siglock it is possible for task->seccomp.filter to have been replaced by SECCOMP_FILTER_FLAG_TSYNC. Fixes: f8e529ed941b ("seccomp, ptrace: add support for dumping seccomp filters") Reported-by: Chris Salls Cc: stable@vger.kernel.org # needs s/refcount_/atomic_/ for v4.12 and earlier Signed-off-by: Oleg Nesterov [tycho: add __get_seccomp_filter vs. open coding refcount_inc()] Signed-off-by: Tycho Andersen [kees: tweak commit log] Signed-off-by: Kees Cook --- kernel/seccomp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c24579dfa7a1..bb3a38005b9c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + refcount_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - refcount_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -491,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -503,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { memset(info, 0, sizeof(*info)); @@ -1025,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: -- cgit v1.2.3 From 63fef14fc98a8b4fad777fd3bef4d068802b3f14 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 18 Aug 2017 17:24:00 +0900 Subject: kprobes/x86: Make insn buffer always ROX and use text_poke() Make insn buffer always ROX and use text_poke() to write the copied instructions instead of set_memory_*(). This makes instruction buffer stronger against other kernel subsystems because there is no window time to modify the buffer. Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . 
Miller Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150304463032.17009.14195368040691676813.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a1606a4224e1..15fba7fe57c8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -117,7 +117,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static void *alloc_insn_page(void) +void __weak *alloc_insn_page(void) { return module_alloc(PAGE_SIZE); } -- cgit v1.2.3 From 3539d09154e11336c31a900a9cd49e386ba6d9b2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 18:59:00 +0900 Subject: kprobes: Improve smoke test to check preemptibility Add preemptible check to each handler. Handlers are called with non-preemtible, which is guaranteed by Documentation/kprobes.txt. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581513991.32348.7956810394499654272.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/test_kprobes.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 0dbab6d1acb4..47106a1e645a 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -34,6 +34,10 @@ static noinline u32 kprobe_target(u32 value) static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) { + if (preemptible()) { + handler_errors++; + pr_err("pre-handler is preemptible\n"); + } preh_val = (rand1 / div_factor); return 0; } @@ -41,6 +45,10 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { + if (preemptible()) { + handler_errors++; + pr_err("post-handler is preemptible\n"); + } if (preh_val != (rand1 / div_factor)) { handler_errors++; pr_err("incorrect value in post_handler\n"); @@ -156,6 +164,10 @@ static int test_kprobes(void) static u32 j_kprobe_target(u32 value) { + if (preemptible()) { + handler_errors++; + pr_err("jprobe-handler is preemptible\n"); + } if (value != rand1) { handler_errors++; pr_err("incorrect value in jprobe handler\n"); @@ -232,6 +244,10 @@ static u32 krph_val; static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { + if (preemptible()) { + handler_errors++; + pr_err("kretprobe entry handler is preemptible\n"); + } krph_val = (rand1 / div_factor); return 0; } @@ -240,6 +256,10 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); + if (preemptible()) { + handler_errors++; + pr_err("kretprobe return handler is preemptible\n"); + } if (ret != (rand1 / div_factor)) { handler_errors++; pr_err("incorrect value in kretprobe handler\n"); -- cgit v1.2.3 From e863d5396146411b615231cae0c518cb2a23371c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 19 Sep 2017 19:00:19 +0900 Subject: kprobes: Warn if optprobe handler tries to change execution path Warn if optprobe handler tries to change execution path. As described in Documentation/kprobes.txt, with optprobe user handler can not change instruction pointer. In that case user must avoid optimizing the kprobes by setting post_handler or break_handler. 
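As an illustration only (not part of this change; all names below are made up): a probe whose pre-handler does divert execution can keep itself out of the optimizer simply by carrying a post_handler, as Documentation/kprobes.txt describes. A minimal sketch:

	#include <linux/kprobes.h>

	/* Intentionally empty: its mere presence prevents optimization,
	 * so the pre-handler is allowed to change the execution path. */
	static void sample_post_handler(struct kprobe *p, struct pt_regs *regs,
					unsigned long flags)
	{
	}

	static int sample_pre_handler(struct kprobe *p, struct pt_regs *regs)
	{
		/* ... point regs at a fixup routine (arch-specific) ... */
		return 1;	/* non-zero: the execution path was changed */
	}

	static struct kprobe sample_kp = {
		.symbol_name	= "do_sample_call",	/* hypothetical symbol */
		.pre_handler	= sample_pre_handler,
		.post_handler	= sample_post_handler,
	};

After register_kprobe(&sample_kp) the probe stays unoptimized because of the post_handler, so the new warning never fires for it; the warning only triggers for optimized probes whose pre-handler returns non-zero.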
Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150581521955.32348.3615624715034787365.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 15fba7fe57c8..2d28377a0e32 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -387,7 +387,10 @@ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) list_for_each_entry_rcu(kp, &p->list, list) { if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); - kp->pre_handler(kp, regs); + if (kp->pre_handler(kp, regs)) { + if (WARN_ON_ONCE(1)) + pr_err("Optprobe ignores instruction pointer changing.(%pF)\n", p->addr); + } } reset_kprobe_instance(); } -- cgit v1.2.3 From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Thu, 28 Sep 2017 12:37:31 +0800 Subject: irq/generic-chip: Don't replace domain's name When generic irq chips are allocated for an irq domain the domain name is set to the irq chip name. That was done to have named domains before the recent changes which enforce domain naming were done. Since then the overwrite causes a memory leak when the domain name is dynamically allocated and even worse it would cause the domain free code to free the wrong name pointer, which might point to a constant. Remove the name assignment to prevent this. Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com --- kernel/irq/generic-chip.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..5270a54b9fa4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } - d->name = name; return 0; } EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); -- cgit v1.2.3 From 77c01d11bbb2b5c005347061bf543ab94878314c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 26 Sep 2017 10:36:03 +0100 Subject: watchdog/hardlockup/perf: Fix spelling mistake: "permanetely" -> "permanently" Trivial fix to spelling mistake in pr_info message Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Acked-by: Don Zickus Cc: Andrew Morton Cc: Babu Moger Link: https://lkml.kernel.org/r/20170926093603.7756-1-colin.king@canonical.com --- kernel/watchdog_hld.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 204a8cadb717..71a62ceacdc8 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -280,7 +280,7 @@ int __init hardlockup_detector_perf_init(void) int ret = hardlockup_detector_event_create(); if (ret) { - pr_info("Perf NMI watchdog permanetely disabled\n"); + pr_info("Perf NMI watchdog permanently disabled\n"); } else { perf_event_release_kernel(this_cpu_read(watchdog_ev)); this_cpu_write(watchdog_ev, NULL); -- cgit v1.2.3 From 2b7c6ba945fd3c10ca3e030be402098aff2f89d3 Mon Sep 17 00:00:00 2001 From: 
Edward Cree Date: Tue, 26 Sep 2017 16:35:13 +0100 Subject: bpf/verifier: improve disassembly of BPF_END instructions print_bpf_insn() was treating all BPF_ALU[64] the same, but BPF_END has a different structure: it has a size in insn->imm (even if it's BPF_X) and uses the BPF_SRC (X or K) to indicate which endianness to use. So it needs different code to print it. Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f849eca36052..e8d7bb8e6b98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -332,26 +332,40 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; +static void print_bpf_end_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + static void print_bpf_insn(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_SRC(insn->code) == BPF_X) + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose("BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(env, insn); + } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); - else + } else { verbose("(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->imm); + } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", -- cgit v1.2.3 From 73c864b38383f4abc9b559025cd663f4a81afa89 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 26 Sep 2017 16:35:29 +0100 Subject: bpf/verifier: improve disassembly of BPF_NEG instructions BPF_NEG takes only one operand, unlike the bulk of BPF_ALU[64] which are compound-assignments. So give it its own format in print_bpf_insn(). Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e8d7bb8e6b98..4cf9b72c59a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -351,6 +351,11 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, verbose("BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose("(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", -- cgit v1.2.3 From 87cbde8d9081b91df86a21d0d743cd700e04890a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 01:45:10 +0200 Subject: PM / s2idle: Invoke the ->wake() platform callback earlier The role of the ->wake() platform callback for suspend-to-idle is to deal with possible spurious wakeups, among other things. 
The ACPI implementation of it, acpi_s2idle_wake(), additionally checks the conditions for entering the Low Power S0 Idle state by the platform and reports the ones that have not been met. However, the ->wake() platform callback is invoked after calling dpm_noirq_resume_devices(), which means that the power states of some devices may have changed since s2idle_enter() returned, so some unmet Low Power S0 Idle conditions may be reported incorrectly as a result of that. To avoid these false positives, reorder the invocations of the dpm_noirq_resume_devices() routine and the ->wake() platform callback in s2idle_loop(). Fixes: 726fb6b4f2a8 (ACPI / PM: Check low power idle constraints for debug only) Tested-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3e2b4f519009..ccd2d20e6b06 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -120,22 +120,26 @@ static void s2idle_loop(void) * frozen processes + suspended devices + idle processors. * Thus s2idle_enter() should be called right after * all devices have been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, + * so prevent them from terminating the loop right away. */ error = dpm_noirq_suspend_devices(PMSG_SUSPEND); if (!error) s2idle_enter(); + else if (error == -EBUSY && pm_wakeup_pending()) + error = 0; - dpm_noirq_resume_devices(PMSG_RESUME); - if (error && (error != -EBUSY || !pm_wakeup_pending())) { - dpm_noirq_end(); - break; - } - - if (s2idle_ops && s2idle_ops->wake) + if (!error && s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); + if (error) + break; + if (s2idle_ops && s2idle_ops->sync) s2idle_ops->sync(); -- cgit v1.2.3 From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes and the name stored in the kernel is always \0 terminated. The kernel will reject name that contains characters other than isalnum() and '_'. It will also reject name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On the input, nr_map_ids tells how big the userspace's map_ids buffer is. On the output, nr_map_ids tells the exact user_map_cnt and it will only copy up to the userspace's map_ids buffer is allowed. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..45970df3f820 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error. + */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + /* '\0' terminates dst */ + *dst = 0; + + return 0; +} + #define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_name static int bpf_prog_load(union bpf_attr *attr) { @@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 *user_map_ids = (u32 *)info.map_ids; + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; -- cgit v1.2.3 From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 45970df3f820..11a7f82a55d1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD numa_node +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Sep 2017 19:08:11 +0300 Subject: perf/aux: Only update ->aux_wakeup in non-overwrite mode The following commit: d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index") changed the AUX wakeup position calculation to rounddown(), which causes a division-by-zero in AUX overwrite mode (aka "snapshot mode"). The zero denominator results from the fact that perf record doesn't set aux_watermark to anything, in which case the kernel will set it to half the AUX buffer size, but only for non-overwrite mode. In the overwrite mode aux_watermark stays zero. The good news is that, AUX overwrite mode, wakeups don't happen and related bookkeeping is not relevant, so we can simply forego the whole wakeup updates. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err: return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. 
It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } -- cgit v1.2.3 From 65d5dc47fe8530e17e318722fb4df676536c2bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:14:08 +0200 Subject: sched/debug: Remove unused variable Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg) } #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { -- cgit v1.2.3 From 9c29c31830a4eca724e137a9339137204bbb31be Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Thu, 7 Sep 2017 20:00:58 +0530 Subject: locking/rwsem-xadd: Fix missed wakeup due to reordering of load If a spinner is present, there is a chance that the load of rwsem_has_spinner() in rwsem_wake() can be reordered with respect to decrement of rwsem count in __up_write() leading to wakeup being missed: spinning writer up_write caller --------------- ----------------------- [S] osq_unlock() [L] osq spin_lock(wait_lock) sem->count=0xFFFFFFFF00000001 +0xFFFFFFFF00000000 count=sem->count MB sem->count=0xFFFFFFFE00000001 -0xFFFFFFFF00000001 spin_trylock(wait_lock) return rwsem_try_write_lock(count) spin_unlock(wait_lock) schedule() Reordering of atomic_long_sub_return_release() in __up_write() and rwsem_has_spinner() in rwsem_wake() can cause missing of wakeup in up_write() context. In spinning writer, sem->count and local variable count is 0XFFFFFFFE00000001. It would result in rwsem_try_write_lock() failing to acquire rwsem and spinning writer going to sleep in rwsem_down_write_failed(). The smp_rmb() will make sure that the spinner state is consulted after sem->count is updated in up_write context. 
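For reference, the generic __up_write() fast path of this era looks roughly like the sketch below (constant name and exact shape approximate). The point is that the sub_return is only a RELEASE operation, so nothing in it prevents the later load of the OSQ in rwsem_wake() from being reordered before the count decrement becomes visible; that is exactly the window the new smp_rmb() closes:

	static inline void __up_write(struct rw_semaphore *sem)
	{
		if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
							    &sem->count) < 0))
			rwsem_wake(sem);	/* reads sem->osq; needs ordering vs. the decrement */
	}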
Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: longman@redhat.com Cc: parri.andrea@gmail.com Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..1fefe6dcafd7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -612,6 +612,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; DEFINE_WAKE_Q(wake_q); + /* + * __rwsem_down_write_failed_common(sem) + * rwsem_optimistic_spin(sem) + * osq_unlock(sem->osq) + * ... + * atomic_long_add_return(&sem->count) + * + * - VS - + * + * __up_write() + * if (atomic_long_sub_return_release(&sem->count) < 0) + * rwsem_wake(sem) + * osq_is_locked(&sem->osq) + * + * And __up_write() must observe !osq_is_locked() when it observes the + * atomic_long_add_return() in order to not miss a wakeup. + * + * This boils down to: + * + * [S.rel] X = 1 [RmW] r0 = (Y += 0) + * MB RMB + * [RmW] Y += 1 [L] r1 = X + * + * exists (r0=1 /\ r1=0) + */ + smp_rmb(); + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize -- cgit v1.2.3 From 5f6ad26ea353fdf3dad2328052cbee49e0b9c5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:23:31 +0200 Subject: sched/tracing: Use common task-state helpers Remove yet another task-state char instance. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 21 ++++++--------------- kernel/trace/trace_sched_wakeup.c | 8 ++++---- 2 files changed, 10 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? 
state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __get_task_state(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __get_task_state(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __get_task_state(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __get_task_state(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit v1.2.3 From 5d68cc95fb24b2f58060cc5340dd7402a11f054e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:32:41 +0200 Subject: sched/debug: Ignore TASK_IDLE for SysRq-W Markus reported that tasks in TASK_IDLE state are reported by SysRq-W, which results in undesirable clutter. 
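For context, TASK_IDLE is TASK_UNINTERRUPTIBLE plus TASK_NOLOAD (roughly, from include/linux/sched.h of this era):

	#define TASK_IDLE		(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

so the plain "p->state & state_filter" test, with SysRq-W passing TASK_UNINTERRUPTIBLE, also matched idle kthreads; the state_filter_match() helper added below filters them out while still letting TASK_KILLABLE waiters through.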
Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..d17c5da523a0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p) put_task_stack(p); } +static inline bool +state_filter_match(unsigned long state_filter, struct task_struct *p) +{ + /* no filter, everything matches */ + if (!state_filter) + return true; + + /* filter, but doesn't match */ + if (!(p->state & state_filter)) + return false; + + /* + * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows + * TASK_KILLABLE). + */ + if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + return false; + + return true; +} + + void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; @@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + if (state_filter_match(state_filter, p)) sched_show_task(p); } -- cgit v1.2.3 From 5ccba44ba118a5000cccc50076b0344632459779 Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Mon, 4 Sep 2017 13:59:34 +0800 Subject: sched/sysctl: Check user input value of sysctl_sched_time_avg System will hang if user set sysctl_sched_time_avg to 0: [root@XXX ~]# sysctl kernel.sched_time_avg_ms=0 Stack traceback for pid 0 0xffff883f6406c600 0 0 1 3 R 0xffff883f6406cf50 *swapper/3 ffff883f7ccc3ae8 0000000000000018 ffffffff810c4dd0 0000000000000000 0000000000017800 ffff883f7ccc3d78 0000000000000003 ffff883f7ccc3bf8 ffffffff810c4fc9 ffff883f7ccc3c08 00000000810c5043 ffff883f7ccc3c08 Call Trace: [] ? update_group_capacity+0x110/0x200 [] ? update_sd_lb_stats+0x109/0x600 [] ? find_busiest_group+0x47/0x530 [] ? load_balance+0x194/0x900 [] ? update_rq_clock.part.83+0x1a/0xe0 [] ? rebalance_domains+0x152/0x290 [] ? run_rebalance_domains+0xdc/0x1d0 [] ? __do_softirq+0xfb/0x320 [] ? irq_exit+0x125/0x130 [] ? scheduler_ipi+0x97/0x160 [] ? smp_reschedule_interrupt+0x29/0x30 [] ? reschedule_interrupt+0x6e/0x80 [] ? cpuidle_enter_state+0xcc/0x230 [] ? cpuidle_enter_state+0x9c/0x230 [] ? cpuidle_enter+0x17/0x20 [] ? cpu_startup_entry+0x38c/0x420 [] ? start_secondary+0x173/0x1e0 Because divide-by-zero error happens in function: update_group_capacity() update_cpu_capacity() scale_rt_capacity() { ... total = sched_avg_period() + delta; used = div_u64(avg, total); ... } To fix this issue, check user input value of sysctl_sched_time_avg, keep it unchanged when hitting invalid input, and set the minimum limit of sysctl_sched_time_avg to 1 ms. 
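For context, the zero denominator comes from sched_avg_period(), which at this point reduces to half the sysctl value in nanoseconds (roughly, from kernel/sched/sched.h):

	static inline u64 sched_avg_period(void)
	{
		return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
	}

With kernel.sched_time_avg_ms=0 and delta also 0, scale_rt_capacity() therefore ends up calling div_u64(avg, 0). With the minmax handler below, an out-of-range write is rejected with -EINVAL instead of being stored.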
Reported-by: James Puthukattukaran Signed-off-by: Ethan Zhao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efault@gmx.de Cc: ethan.kernel@gmail.com Cc: keescook@chromium.org Cc: mcgrof@kernel.org Cc: Link: http://lkml.kernel.org/r/1504504774-18253-1-git-send-email-ethan.zhao@oracle.com Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..423554ad3610 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, #ifdef CONFIG_SCHEDSTATS { -- cgit v1.2.3 From 5bce9db1894c998c5b85a34036d679ea6517668f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 29 Aug 2017 17:01:03 +0300 Subject: perf/core: Explain perf_sched_mutex To clarify why atomic_inc_return(&perf_sched_events) is not sufficient and a mutex is needed to order static branch enabling vs the atomic counter increment, this adds a comment with a short explanation. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170829140103.6563-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..5ee62714f9a6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9394,6 +9394,11 @@ static void account_event(struct perf_event *event) inc = true; if (inc) { + /* + * We need the mutex here because static_branch_enable() + * must complete *before* the perf_sched_count increment + * becomes visible. + */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; -- cgit v1.2.3 From 7c80cfc99b7bfdc92cee26f8008859f326f4a37f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 16:03:17 +0200 Subject: sched/fair: Clean up calc_cfs_shares() For consistencies sake, we should have only a single reading of tg->shares. 
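The consistency issue is that tg->shares can change concurrently (e.g. via sched_group_set_shares()), so with separate reads the clamp may be applied against a different value than the one used for the product. A simplified sketch of the hazard in the old multi-read code:

	shares = (tg->shares * load);		/* read #1 */
	if (tg_weight)
		shares /= tg_weight;
	...
	if (shares > tg->shares)		/* read #2 may observe an updated value */
		shares = tg->shares;		/* read #3 */

Taking a single READ_ONCE() snapshot into tg_shares, as in the hunk below, makes the product and the clamp agree by construction.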
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..048fbfb75523 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2694,9 +2694,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +static long calc_cfs_shares(struct cfs_rq *cfs_rq) { - long tg_weight, load, shares; + long tg_weight, tg_shares, load, shares; + struct task_group *tg = cfs_rq->tg; + + tg_shares = READ_ONCE(tg->shares); /* * This really should be: cfs_rq->avg.load_avg, but instead we use @@ -2711,7 +2714,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) tg_weight -= cfs_rq->tg_load_avg_contrib; tg_weight += load; - shares = (tg->shares * load); + shares = (tg_shares * load); if (tg_weight) shares /= tg_weight; @@ -2727,17 +2730,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) * case no task is runnable on a CPU MIN_SHARES=2 should be returned * instead of 0. */ - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; - - return shares; -} -# else /* CONFIG_SMP */ -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - return tg->shares; + return clamp_t(long, shares, MIN_SHARES, tg_shares); } # endif /* CONFIG_SMP */ @@ -2762,7 +2755,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_shares(struct sched_entity *se) { struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg; long shares; if (!cfs_rq) @@ -2771,13 +2763,14 @@ static void update_cfs_shares(struct sched_entity *se) if (throttled_hierarchy(cfs_rq)) return; - tg = cfs_rq->tg; - #ifndef CONFIG_SMP - if (likely(se->load.weight == tg->shares)) + shares = READ_ONCE(cfs_rq->tg->shares); + + if (likely(se->load.weight == shares)) return; +#else + shares = calc_cfs_shares(cfs_rq); #endif - shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } -- cgit v1.2.3 From cef27403cbe98ebda0a32d43128dd60c309eb966 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 May 2017 11:04:07 +0200 Subject: sched/fair: Add comment to calc_cfs_shares() Explain the magic equation in calc_cfs_shares() a bit better. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 048fbfb75523..dd565aeafc5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2694,6 +2694,67 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP +/* + * All this does is approximate the hierarchical proportion which includes that + * global sum we all love to hate. + * + * That is, the weight of a group entity, is the proportional share of the + * group weight based on the group runqueue weights. 
That is: + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- (1) + * \Sum grq->load.weight + * + * Now, because computing that sum is prohibitively expensive to compute (been + * there, done that) we approximate it with this average stuff. The average + * moves slower and therefore the approximation is cheaper and more stable. + * + * So instead of the above, we substitute: + * + * grq->load.weight -> grq->avg.load_avg (2) + * + * which yields the following: + * + * tg->weight * grq->avg.load_avg + * ge->load.weight = ------------------------------ (3) + * tg->load_avg + * + * Where: tg->load_avg ~= \Sum grq->avg.load_avg + * + * That is shares_avg, and it is right (given the approximation (2)). + * + * The problem with it is that because the average is slow -- it was designed + * to be exactly that of course -- this leads to transients in boundary + * conditions. In specific, the case where the group was idle and we start the + * one task. It takes time for our CPU's grq->avg.load_avg to build up, + * yielding bad latency etc.. + * + * Now, in that special case (1) reduces to: + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- = tg>weight (4) + * grp->load.weight + * + * That is, the sum collapses because all other CPUs are idle; the UP scenario. + * + * So what we do is modify our approximation (3) to approach (4) in the (near) + * UP case, like: + * + * ge->load.weight = + * + * tg->weight * grq->load.weight + * --------------------------------------------------- (5) + * tg->load_avg - grq->avg.load_avg + grq->load.weight + * + * + * And that is shares_weight and is icky. In the (near) UP case it approaches + * (4) while in the normal case it approaches (3). It consistently + * overestimates the ge->load.weight and therefore: + * + * \Sum ge->load.weight >= tg->weight + * + * hence icky! + */ static long calc_cfs_shares(struct cfs_rq *cfs_rq) { long tg_weight, tg_shares, load, shares; -- cgit v1.2.3 From 3d4b60d3e3dde6ea24e439000eb3b71078da81f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 May 2017 18:16:06 +0200 Subject: sched/fair: Cure calc_cfs_shares() vs. reweight_entity() Vincent reported that when running in a cgroup, his root cfs_rq->avg.load_avg dropped to 0 on task idle. This is because reweight_entity() will now immediately propagate the weight change of the group entity to its cfs_rq, and as it happens, our approxmation (5) for calc_cfs_shares() results in 0 when the group is idle. Avoid this by using the correct (3) as a lower bound on (5). This way the empty cgroup will slowly decay instead of instantly drop to 0. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd565aeafc5a..63166a0ed854 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2763,11 +2763,10 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) tg_shares = READ_ONCE(tg->shares); /* - * This really should be: cfs_rq->avg.load_avg, but instead we use - * cfs_rq->load.weight, which is its upper bound. This helps ramp up - * the shares for small weight interactive tasks. + * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3) + * as a lower bound. 
*/ - load = scale_load_down(cfs_rq->load.weight); + load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); tg_weight = atomic_long_read(&tg->load_avg); -- cgit v1.2.3 From c7b50216818ef3dca14a52e3499750fbad2d9691 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 16:42:08 +0200 Subject: sched/fair: Remove se->load.weight from se->avg.load_sum Remove the load from the load_sum for sched_entities, basically turning load_sum into runnable_sum. This prepares for better reweighting of group entities. Since we now have different rules for computing load_avg, split ___update_load_avg() into two parts, ___update_load_sum() and ___update_load_avg(). So for se: ___update_load_sum(.weight = 1) ___upate_load_avg(.weight = se->load.weight) and for cfs_rq: ___update_load_sum(.weight = cfs_rq->load.weight) ___upate_load_avg(.weight = 1) Since the primary consumable is load_avg, most things will not be affected. Only those few sites that initialize/modify load_sum need attention. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 91 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 63166a0ed854..3b5b82345774 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -731,7 +731,7 @@ void init_entity_runnable_average(struct sched_entity *se) */ if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->load_sum = LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -2025,7 +2025,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.load_sum / p->se.load.weight; + delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; } @@ -3018,8 +3018,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_avg(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) +___update_load_sum(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { u64 delta; @@ -3065,39 +3065,80 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) return 0; + return 1; +} + +static __always_inline void +___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq) +{ + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); + sa->load_avg = div_u64(weight * sa->load_sum, divider); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); + div_u64(cfs_rq->runnable_load_sum, divider); } - sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); + sa->util_avg = sa->util_sum / divider; +} - return 1; +/* + * XXX we want to get rid of this helper and use the full load resolution. 
+ */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); } +/* + * sched_entity: + * + * load_sum := runnable_sum + * load_avg = se_weight(se) * runnable_avg + * + * cfq_rs: + * + * load_sum = \Sum se_weight(se) * se->avg.load_sum + * load_avg = \Sum se->avg.load_avg + */ + static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); + if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) { + ___update_load_avg(&se->avg, se_weight(se), NULL); + return 1; + } + + return 0; } static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, + cfs_rq->curr == se, NULL)) { + + ___update_load_avg(&se->avg, se_weight(se), NULL); + return 1; + } + + return 0; } static int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { - return ___update_load_avg(now, cpu, &cfs_rq->avg, - scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq); + if (___update_load_sum(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq)) { + ___update_load_avg(&cfs_rq->avg, 1, cfs_rq); + return 1; + } + + return 0; } /* @@ -3267,7 +3308,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) /* Set new sched_entity's load */ se->avg.load_avg = load; - se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + se->avg.load_sum = LOAD_AVG_MAX; /* Update parent cfs_rq load */ add_positive(&cfs_rq->avg.load_avg, delta); @@ -3473,7 +3514,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s { se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; set_tg_cfs_propagate(cfs_rq); @@ -3493,7 +3534,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s { sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); set_tg_cfs_propagate(cfs_rq); @@ -3505,12 +3546,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct sched_avg *sa = &se->avg; - - cfs_rq->runnable_load_avg += sa->load_avg; - cfs_rq->runnable_load_sum += sa->load_sum; + cfs_rq->runnable_load_avg += se->avg.load_avg; + cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; - if (!sa->last_update_time) { + if (!se->avg.last_update_time) { attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); } @@ -3520,10 +3559,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->runnable_load_avg = - max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); - cfs_rq->runnable_load_sum = - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); + 
sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); } #ifndef CONFIG_64BIT -- cgit v1.2.3 From 88c0616ee729067ecb412bed76ef4a8734ea5100 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 17:32:43 +0200 Subject: sched/fair: Change update_load_avg() arguments Most call sites of update_load_avg() already have cfs_rq_of(se) available, pass it down instead of recomputing it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b5b82345774..a6c53580fea0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3480,9 +3480,8 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #define SKIP_AGE_LOAD 0x2 /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int flags) +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); @@ -3643,9 +3642,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 -static inline void update_load_avg(struct sched_entity *se, int not_used1) +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { - cfs_rq_util_change(cfs_rq_of(se)); + cfs_rq_util_change(cfs_rq); } static inline void @@ -3796,7 +3795,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); @@ -3880,7 +3879,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3968,7 +3967,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -4070,7 +4069,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_load_avg(prev, 0); + update_load_avg(cfs_rq, prev, 0); } cfs_rq->curr = NULL; } @@ -4086,7 +4085,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. 
*/ - update_load_avg(curr, UPDATE_TG); + update_load_avg(cfs_rq, curr, UPDATE_TG); update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK @@ -5004,7 +5003,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); update_cfs_shares(se); } @@ -5063,7 +5062,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); update_cfs_shares(se); } @@ -7121,7 +7120,7 @@ static void update_blocked_averages(int cpu) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(se, 0); + update_load_avg(cfs_rq_of(se), se, 0); /* * There can be a lot of idle CPU cgroups. Don't let fully @@ -9295,7 +9294,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); } } #else @@ -9307,7 +9306,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* Catch up with the cfs_rq and remove our load when we leave */ - update_load_avg(se, 0); + update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); @@ -9326,7 +9325,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif /* Synchronize entity with its cfs_rq */ - update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); @@ -9610,7 +9609,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); update_cfs_shares(se); } rq_unlock_irqrestore(rq, &rf); -- cgit v1.2.3 From b382a531b9fece229c8358a9fb79431cddcf93c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 17:37:03 +0200 Subject: sched/fair: Move enqueue migrate handling Move the entity migrate handling from enqueue_entity_load_avg() to update_load_avg(). This has two benefits: - {en,de}queue_entity_load_avg() will become purely about managing runnable_load - we can avoid a double update_tg_load_avg() and reduce pressure on the global tg->shares cacheline The reason we do this is so that we can change update_cfs_shares() to change both weight and (future) runnable_weight. For this to work we need to have the cfs_rq averages up-to-date (which means having done the attach), but we need the cfs_rq->avg.runnable_avg to not yet include the se's contribution (since se->on_rq == 0). 
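[ Editor's illustration, not part of the patch: a toy model of the resulting DO_ATTACH dispatch, with stand-in types; the real hunk follows below. A freshly migrated entity is recognised by last_update_time == 0, and the tg load is then updated exactly once on either path: ]

#include <stdbool.h>

#define UPDATE_TG	0x1
#define DO_ATTACH	0x4

struct entity { unsigned long long last_update_time; };
struct rq     { int nr_attached; int tg_updates; };

static void attach(struct rq *rq, struct entity *se)	{ rq->nr_attached++; }
static void update_tg(struct rq *rq)			{ rq->tg_updates++; }

static void update_load_avg(struct rq *rq, struct entity *se, int flags)
{
	bool decayed = true;			/* pretend the averages changed */

	if (!se->last_update_time && (flags & DO_ATTACH)) {
		attach(rq, se);			/* migrate-in: attach ... */
		update_tg(rq);			/* ... and a single tg update */
	} else if (decayed && (flags & UPDATE_TG)) {
		update_tg(rq);			/* normal propagation */
	}
}

[ enqueue_entity() then passes UPDATE_TG | DO_ATTACH, as in the hunk below. ]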
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 70 +++++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a6c53580fea0..605b7c3e1ab8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3473,34 +3473,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) return decayed || removed_load; } -/* - * Optional action to be done while updating the load average - */ -#define UPDATE_TG 0x1 -#define SKIP_AGE_LOAD 0x2 - -/* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); - int decayed; - - /* - * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration - */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) - __update_load_avg_se(now, cpu, cfs_rq, se); - - decayed = update_cfs_rq_load_avg(now, cfs_rq); - decayed |= propagate_entity_load_avg(se); - - if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); -} - /** * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to @@ -3541,17 +3513,46 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq_util_change(cfs_rq); } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 +#define DO_ATTACH 0x4 + +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + u64 now = cfs_rq_clock_task(cfs_rq); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + int decayed; + + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); + + decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed |= propagate_entity_load_avg(se); + + if (!se->avg.last_update_time && (flags & DO_ATTACH)) { + + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, 0); + + } else if (decayed && (flags & UPDATE_TG)) + update_tg_load_avg(cfs_rq, 0); +} + /* Add the load generated by se into cfs_rq's load average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->runnable_load_avg += se->avg.load_avg; cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; - - if (!se->avg.last_update_time) { - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); - } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ @@ -3641,6 +3642,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 +#define DO_ATTACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -3795,7 +3797,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ - update_load_avg(cfs_rq, se, UPDATE_TG); + 
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); enqueue_entity_load_avg(cfs_rq, se); update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); -- cgit v1.2.3 From b5b3e35f4149df72aaba612bba195fb2ec37b1b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Aug 2017 17:38:30 +0200 Subject: sched/fair: Rename {en,de}queue_entity_load_avg() Since they're now purely about runnable_load, rename them. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 605b7c3e1ab8..fad77c8454e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3549,7 +3549,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Add the load generated by se into cfs_rq's load average */ static inline void -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->runnable_load_avg += se->avg.load_avg; cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; @@ -3557,7 +3557,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); @@ -3650,9 +3650,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s } static inline void -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void @@ -3798,7 +3798,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); - enqueue_entity_load_avg(cfs_rq, se); + enqueue_runnable_load_avg(cfs_rq, se); update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); @@ -3882,7 +3882,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * of its group cfs_rq. */ update_load_avg(cfs_rq, se, UPDATE_TG); - dequeue_entity_load_avg(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); -- cgit v1.2.3 From 8d5b9025f9b4500f828260dc62e8ffa823ce0d59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Aug 2017 17:45:35 +0200 Subject: sched/fair: Introduce {en,de}queue_load_avg() Analogous to the existing {en,de}queue_runnable_load_avg() add helpers for {en,de}queue_load_avg(). More users will follow. Includes some code movement to avoid fwd declarations. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 156 +++++++++++++++++++++++++++++----------------------- 1 file changed, 86 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fad77c8454e4..654d8e3d6047 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2692,6 +2692,90 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of this helper and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->runnable_load_avg += se->avg.load_avg; + cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); +} + +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); +} +#else +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP /* @@ -3084,14 +3168,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cf sa->util_avg = sa->util_sum / divider; } -/* - * XXX we want to get rid of this helper and use the full load resolution. 
- */ -static inline long se_weight(struct sched_entity *se) -{ - return scale_load_down(se->load.weight); -} - /* * sched_entity: * @@ -3141,26 +3217,6 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) return 0; } -/* - * Signed add and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define add_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - \ - res = var + val; \ - \ - if (val < 0 && res > var) \ - res = 0; \ - \ - WRITE_ONCE(*ptr, res); \ -} while (0) - #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -3405,23 +3461,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -/* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_task() @@ -3484,8 +3523,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { se->avg.last_update_time = cfs_rq->avg.last_update_time; - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; + enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; set_tg_cfs_propagate(cfs_rq); @@ -3503,9 +3541,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); set_tg_cfs_propagate(cfs_rq); @@ -3547,22 +3583,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s update_tg_load_avg(cfs_rq, 0); } -/* Add the load generated by se into cfs_rq's load average */ -static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_load_avg += se->avg.load_avg; - cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; -} - -/* Remove the runnable load generated by se from cfs_rq's runnable load average */ -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); -} - #ifndef CONFIG_64BIT static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { @@ -3649,10 +3669,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq_util_change(cfs_rq); } -static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static 
inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -- cgit v1.2.3 From 840c5abca499a858619954dbcffc82110bb6e076 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 16:11:34 +0200 Subject: sched/fair: More accurate reweight_entity() When a (group) entity changes it's weight we should instantly change its load_avg and propagate that change into the sums it is part of. Because we use these values to predict future behaviour and are not interested in its historical value. Without this change, the change in load would need to propagate through the average, by which time it could again have changed etc.. always chasing itself. With this change, the cfs_rq load_avg sum will more accurately reflect the current runnable and expected return of blocked load. Reported-by: Paul Turner [josef: compile fix !SMP || !FAIR_GROUP] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 654d8e3d6047..750ae4dbf812 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2886,12 +2886,22 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (cfs_rq->curr == se) update_curr(cfs_rq); account_entity_dequeue(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); } + dequeue_load_avg(cfs_rq, se); update_load_set(&se->load, weight); - if (se->on_rq) +#ifdef CONFIG_SMP + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, + LOAD_AVG_MAX - 1024 + se->avg.period_contrib); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) { account_entity_enqueue(cfs_rq, se); + enqueue_runnable_load_avg(cfs_rq, se); + } } static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -- cgit v1.2.3 From 9059393e4ec1c8c6623a120b405ef2c90b968d80 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 17 May 2017 11:50:45 +0200 Subject: sched/fair: Use reweight_entity() for set_user_nice() Now that we directly change load_avg and propagate that change into the sums, sys_nice() and co should do the same, otherwise its possible to confuse load accounting when we migrate near the weight change. Fixes-by: Josef Bacik Signed-off-by: Vincent Guittot [ Added changelog, fixed the call condition. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170517095045.GA8420@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++++++++++++------ kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++---------------------- kernel/sched/sched.h | 2 ++ 3 files changed, 54 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..2288a145bf01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -733,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -747,8 +747,16 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; + /* + * SCHED_OTHER tasks have to update their load when changing their + * weight + */ + if (update_load && p->sched_class == &fair_sched_class) { + reweight_task(p, prio); + } else { + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; + } } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -2358,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = __normal_prio(p); - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -3805,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); delta = p->prio - old_prio; @@ -3962,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* Actually do priority change: must hold pi & rq lock. 
*/ @@ -5933,7 +5941,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 750ae4dbf812..d8c02b31498d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2776,6 +2776,43 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); + } + dequeue_load_avg(cfs_rq, se); + + update_load_set(&se->load, weight); + +#ifdef CONFIG_SMP + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, + LOAD_AVG_MAX - 1024 + se->avg.period_contrib); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) { + account_entity_enqueue(cfs_rq, se); + enqueue_runnable_load_avg(cfs_rq, se); + } +} + +void reweight_task(struct task_struct *p, int prio) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + reweight_entity(cfs_rq, se, weight); + load->inv_weight = sched_prio_to_wmult[prio]; +} + #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP /* @@ -2878,32 +2915,6 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) } # endif /* CONFIG_SMP */ -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) -{ - if (se->on_rq) { - /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); - dequeue_runnable_load_avg(cfs_rq, se); - } - dequeue_load_avg(cfs_rq, se); - - update_load_set(&se->load, weight); - -#ifdef CONFIG_SMP - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, - LOAD_AVG_MAX - 1024 + se->avg.period_contrib); -#endif - - enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { - account_entity_enqueue(cfs_rq, se); - enqueue_runnable_load_avg(cfs_rq, se); - } -} - static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_shares(struct sched_entity *se) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..a5d97460ee4e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1529,6 +1529,8 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void reweight_task(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); -- cgit v1.2.3 From 2a2f5d4e44ed160a5ed822c94e04f918f9fbb487 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 May 2017 16:51:41 +0200 Subject: sched/fair: Rewrite cfs_rq->removed_*avg Since on wakeup migration we don't hold the rq->lock for the old CPU we cannot update its state. Instead we add the removed 'load' to an atomic variable and have the next update on that CPU collect and process it. Currently we have 2 atomic variables; which already have the issue that they can be read out-of-sync. Also, two atomic ops on a single cacheline is already more expensive than an uncontended lock. Since we want to add more, convert the thing over to an explicit cacheline with a lock in. 
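[ Editor's sketch, not part of the patch: the add/collect pattern this converts to, in plain userspace C with a pthread mutex standing in for the raw spinlock. One lock covers all fields, so a single critical section replaces several atomic RMW ops and the fields can never be read out of sync with each other: ]

#include <pthread.h>

struct removed {			/* stand-in for the new cfs_rq->removed block */
	pthread_mutex_t lock;		/* raw_spinlock_t, ____cacheline_aligned in the kernel */
	int nr;
	unsigned long load_avg;
	unsigned long util_avg;
};

/* remote CPU: record the departing entity's contribution */
static void removed_add(struct removed *r, unsigned long load, unsigned long util)
{
	pthread_mutex_lock(&r->lock);
	r->nr++;
	r->load_avg += load;
	r->util_avg += util;
	pthread_mutex_unlock(&r->lock);
}

/* owning CPU: collect and clear everything in one go on its next update */
static void removed_collect(struct removed *r, unsigned long *load, unsigned long *util)
{
	pthread_mutex_lock(&r->lock);
	*load = r->load_avg;	r->load_avg = 0;
	*util = r->util_avg;	r->util_avg = 0;
	r->nr = 0;
	pthread_mutex_unlock(&r->lock);
}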
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 ++++---- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++---------------- kernel/sched/sched.h | 13 +++++++++---- 3 files changed, 48 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f93e4a2d9f6..2f22342c48ff 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -564,10 +564,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", - atomic_long_read(&cfs_rq->removed_load_avg)); - SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", - atomic_long_read(&cfs_rq->removed_util_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", + cfs_rq->removed.load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", + cfs_rq->removed.util_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8c02b31498d..fe4a66b10480 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3501,36 +3501,47 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { + unsigned long removed_load = 0, removed_util = 0; struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed_load = 0, removed_util = 0; + int decayed = 0; - if (atomic_long_read(&cfs_rq->removed_load_avg)) { - s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + if (cfs_rq->removed.nr) { + unsigned long r; + + raw_spin_lock(&cfs_rq->removed.lock); + swap(cfs_rq->removed.util_avg, removed_util); + swap(cfs_rq->removed.load_avg, removed_load); + cfs_rq->removed.nr = 0; + raw_spin_unlock(&cfs_rq->removed.lock); + + /* + * The LOAD_AVG_MAX for _sum is a slight over-estimate, + * which is safe due to sub_positive() clipping at 0. + */ + r = removed_load; sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); - removed_load = 1; - set_tg_cfs_propagate(cfs_rq); - } - if (atomic_long_read(&cfs_rq->removed_util_avg)) { - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + r = removed_util; sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); - removed_util = 1; + set_tg_cfs_propagate(cfs_rq); + + decayed = 1; } - decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed || removed_util) + if (decayed) cfs_rq_util_change(cfs_rq); - return decayed || removed_load; + return decayed; } /** @@ -3645,6 +3656,7 @@ void sync_entity_load_avg(struct sched_entity *se) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + unsigned long flags; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3654,11 +3666,19 @@ void remove_entity_load_avg(struct sched_entity *se) * Similarly for groups, they will have passed through * post_init_entity_util_avg() before unregister_sched_fair_group() * calls this. 
+ * + * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING + * we could actually get the right time, since we're called with + * rq->lock held, see detach_task(). */ sync_entity_load_avg(se); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); + ++cfs_rq->removed.nr; + cfs_rq->removed.util_avg += se->avg.util_avg; + cfs_rq->removed.load_avg += se->avg.load_avg; + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -9449,8 +9469,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->propagate_avg = 0; #endif - atomic_long_set(&cfs_rq->removed_load_avg, 0); - atomic_long_set(&cfs_rq->removed_util_avg, 0); + raw_spin_lock_init(&cfs_rq->removed.lock); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5d97460ee4e..2fd350a12bb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -445,14 +445,19 @@ struct cfs_rq { struct sched_avg avg; u64 runnable_load_sum; unsigned long runnable_load_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; unsigned long propagate_avg; #endif - atomic_long_t removed_load_avg, removed_util_avg; -#ifndef CONFIG_64BIT - u64 load_last_update_time_copy; -#endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + } removed; #ifdef CONFIG_FAIR_GROUP_SCHED /* -- cgit v1.2.3 From 0e2d2aaaae52c247c047d14999b93486bdbd3431 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 May 2017 17:30:46 +0200 Subject: sched/fair: Rewrite PELT migration propagation When an entity migrates in (or out) of a runqueue, we need to add (or remove) its contribution from the entire PELT hierarchy, because even non-runnable entities are included in the load average sums. In order to do this we have some propagation logic that updates the PELT tree, however the way it 'propagates' the runnable (or load) change is (more or less): tg->weight * grq->avg.load_avg ge->avg.load_avg = ------------------------------ tg->load_avg But that is the expression for ge->weight, and per the definition of load_avg: ge->avg.load_avg := ge->weight * ge->avg.runnable_avg That destroys the runnable_avg (by setting it to 1) we wanted to propagate. Instead directly propagate runnable_sum. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 + kernel/sched/fair.c | 186 +++++++++++++++++++++++++++++---------------------- kernel/sched/sched.h | 9 +-- 3 files changed, 112 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f22342c48ff..2e039a81864c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -568,6 +568,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", cfs_rq->removed.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", + cfs_rq->removed.runnable_sum); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4a66b10480..086a5d979720 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } -/* Take into account change of utilization of a child task group */ + +/* + * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to + * propagate its contribution. The key to this propagation is the invariant + * that for each group: + * + * ge->avg == grq->avg (1) + * + * _IFF_ we look at the pure running and runnable sums. Because they + * represent the very same entity, just at different points in the hierarchy. + * + * + * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and + * simply copies the running sum over. + * + * However, update_tg_cfs_runnable() is more complex. So we have: + * + * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) + * + * And since, like util, the runnable part should be directly transferable, + * the following would _appear_ to be the straight forward approach: + * + * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) + * + * And per (1) we have: + * + * ge->avg.running_avg == grq->avg.running_avg + * + * Which gives: + * + * ge->load.weight * grq->avg.load_avg + * ge->avg.load_avg = ----------------------------------- (4) + * grq->load.weight + * + * Except that is wrong! + * + * Because while for entities historical weight is not important and we + * really only care about our future and therefore can consider a pure + * runnable sum, runqueues can NOT do this. + * + * We specifically want runqueues to have a load_avg that includes + * historical weights. Those represent the blocked load, the load we expect + * to (shortly) return to us. This only works by keeping the weights as + * integral part of the sum. We therefore cannot decompose as per (3). + * + * OK, so what then? + * + * + * Another way to look at things is: + * + * grq->avg.load_avg = \Sum se->avg.load_avg + * + * Therefore, per (2): + * + * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * + * And the very thing we're propagating is a change in that sum (someone + * joined/left). So we can easily know the runnable change, which would be, per + * (2) the already tracked se->load_avg divided by the corresponding + * se->weight. 
+ * + * Basically (4) but in differential form: + * + * d(runnable_avg) += se->avg.load_avg / se->load.weight + * (5) + * ge->avg.load_avg += ge->load.weight * d(runnable_avg) + */ + static inline void -update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; /* Nothing to update */ @@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; } -/* Take into account change of load of a child task group */ static inline void -update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long delta, load = gcfs_rq->avg.load_avg; + long runnable_sum = gcfs_rq->prop_runnable_sum; + long load_avg; + s64 load_sum; - /* - * If the load of group cfs_rq is null, the load of the - * sched_entity will also be null so we can skip the formula - */ - if (load) { - long tg_load; - - /* Get tg's load and ensure tg_load > 0 */ - tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; - - /* Ensure tg_load >= load and updated with current load*/ - tg_load -= gcfs_rq->tg_load_avg_contrib; - tg_load += load; - - /* - * We need to compute a correction term in the case that the - * task group is consuming more CPU than a task of equal - * weight. A task with a weight equals to tg->shares will have - * a load less or equal to scale_load_down(tg->shares). - * Similarly, the sched_entities that represent the task group - * at parent level, can't have a load higher than - * scale_load_down(tg->shares). And the Sum of sched_entities' - * load must be <= scale_load_down(tg->shares). - */ - if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { - /* scale gcfs_rq's load into tg's shares*/ - load *= scale_load_down(gcfs_rq->tg->shares); - load /= tg_load; - } - } + if (!runnable_sum) + return; - delta = load - se->avg.load_avg; + gcfs_rq->prop_runnable_sum = 0; - /* Nothing to update */ - if (!delta) - return; + load_sum = (s64)se_weight(se) * runnable_sum; + load_avg = div_s64(load_sum, LOAD_AVG_MAX); - /* Set new sched_entity's load */ - se->avg.load_avg = load; - se->avg.load_sum = LOAD_AVG_MAX; + add_positive(&se->avg.load_sum, runnable_sum); + add_positive(&se->avg.load_avg, load_avg); - /* Update parent cfs_rq load */ - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + add_positive(&cfs_rq->avg.load_avg, load_avg); + add_positive(&cfs_rq->avg.load_sum, load_sum); - /* - * If the sched_entity is already enqueued, we also have to update the - * runnable load avg. 
- */ if (se->on_rq) { - /* Update parent cfs_rq runnable_load_avg */ - add_positive(&cfs_rq->runnable_load_avg, delta); - cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + add_positive(&cfs_rq->runnable_load_avg, load_avg); + add_positive(&cfs_rq->runnable_load_sum, load_sum); } } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) { - cfs_rq->propagate_avg = 1; -} - -static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - - if (!cfs_rq->propagate_avg) - return 0; - - cfs_rq->propagate_avg = 0; - return 1; + cfs_rq->propagate = 1; + cfs_rq->prop_runnable_sum += runnable_sum; } /* Update task and its cfs_rq load average */ static inline int propagate_entity_load_avg(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *gcfs_rq; if (entity_is_task(se)) return 0; - if (!test_and_clear_tg_cfs_propagate(se)) + gcfs_rq = group_cfs_rq(se); + if (!gcfs_rq->propagate) return 0; + gcfs_rq->propagate = 0; + cfs_rq = cfs_rq_of(se); - set_tg_cfs_propagate(cfs_rq); + add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); - update_tg_cfs_util(cfs_rq, se); - update_tg_cfs_load(cfs_rq, se); + update_tg_cfs_util(cfs_rq, se, gcfs_rq); + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); return 1; } @@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) * If there is a pending propagation, we have to update the load and * the utilization of the sched_entity: */ - if (gcfs_rq->propagate_avg) + if (gcfs_rq->propagate) return false; /* @@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 0; } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); + swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); @@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); - set_tg_cfs_propagate(cfs_rq); + add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); decayed = 1; } @@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq); } @@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, 
-se->avg.load_sum); cfs_rq_util_change(cfs_rq); } @@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se) ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; + cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } @@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->propagate_avg = 0; -#endif raw_spin_lock_init(&cfs_rq->removed.lock); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2fd350a12bb7..5bcb86eb026b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -447,19 +447,20 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; -#endif -#ifdef CONFIG_FAIR_GROUP_SCHED - unsigned long tg_load_avg_contrib; - unsigned long propagate_avg; #endif struct { raw_spinlock_t lock ____cacheline_aligned; int nr; unsigned long load_avg; unsigned long util_avg; + unsigned long runnable_sum; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + /* * h_load = weight * f(tg) * -- cgit v1.2.3 From 1ea6c46a23f1213d1972bfae220db5c165e27bba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 15:59:54 +0200 Subject: sched/fair: Propagate an effective runnable_load_avg The load balancer uses runnable_load_avg as load indicator. For !cgroup this is: runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq That is, a direct sum of all runnable tasks on that runqueue. As opposed to load_avg, which is a sum of all tasks on the runqueue, which includes a blocked component. However, in the cgroup case, this comes apart since the group entities are always runnable, even if most of their constituent entities are blocked. Therefore introduce a runnable_weight which for task entities is the same as the regular weight, but for group entities is a fraction of the entity weight and represents the runnable part of the group runqueue. Then propagate this load through the PELT hierarchy to arrive at an effective runnable load average -- which we should not confuse with the canonical runnable load average.
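[ Editor's sketch, not part of the patch: the group runnable_weight is the runnable fraction of the group's shares, guarded against the sporadic runnable > load case noted in the hunk below. A self-contained toy version: ]

struct grq_avg {			/* stand-in for the group cfs_rq averages */
	unsigned long load_avg;
	unsigned long runnable_load_avg;
};

static unsigned long group_runnable_weight(unsigned long shares, const struct grq_avg *avg)
{
	unsigned long runnable = shares * avg->runnable_load_avg;
	unsigned long divider = avg->load_avg > avg->runnable_load_avg ?
				avg->load_avg : avg->runnable_load_avg;

	if (runnable)			/* divider is non-zero whenever runnable is */
		runnable /= divider;

	return runnable;
}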
Suggested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 ++- kernel/sched/fair.c | 172 +++++++++++++++++++++++++++++++++------------------ kernel/sched/sched.h | 3 +- 3 files changed, 121 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2e039a81864c..1ca0130ed4f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P_SCHEDSTAT(se->statistics.wait_count); } P(se->load.weight); + P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -558,10 +560,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); + cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", @@ -1006,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); + P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 086a5d979720..10d2000fca2d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -730,8 +730,9 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = LOAD_AVG_MAX; + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX; + /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -2731,25 +2732,35 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP /* - * XXX we want to get rid of this helper and use the full load resolution. + * XXX we want to get rid of these helpers and use the full load resolution. 
*/ static inline long se_weight(struct sched_entity *se) { return scale_load_down(se->load.weight); } +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->runnable_load_avg += se->avg.load_avg; - cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; + cfs_rq->runnable_weight += se->runnable_weight; + + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; } static inline void dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); + cfs_rq->runnable_weight -= se->runnable_weight; + + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); + sub_positive(&cfs_rq->avg.runnable_load_sum, + se_runnable(se) * se->avg.runnable_load_sum); } static inline void @@ -2777,7 +2788,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) + unsigned long weight, unsigned long runnable) { if (se->on_rq) { /* commit outstanding execution time */ @@ -2788,11 +2799,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); + se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, - LOAD_AVG_MAX - 1024 + se->avg.period_contrib); + do { + u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + se->avg.runnable_load_avg = + div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); + } while (0); #endif enqueue_load_avg(cfs_rq, se); @@ -2809,7 +2826,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); + reweight_entity(cfs_rq, se, weight, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -2917,31 +2934,45 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct sched_entity *se) +/* + * Recomputes the group entity based on the current state of its group + * runqueue. + */ +static void update_cfs_group(struct sched_entity *se) { - struct cfs_rq *cfs_rq = group_cfs_rq(se); - long shares; + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long shares, runnable; - if (!cfs_rq) + if (!gcfs_rq) return; - if (throttled_hierarchy(cfs_rq)) + if (throttled_hierarchy(gcfs_rq)) return; #ifndef CONFIG_SMP - shares = READ_ONCE(cfs_rq->tg->shares); + runnable = shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else - shares = calc_cfs_shares(cfs_rq); + shares = calc_cfs_shares(gcfs_rq); + /* + * The hierarchical runnable load metric is the proportional part + * of this group's runnable_load_avg / load_avg. + * + * Note: we need to deal with very sporadic 'runnable > load' cases + * due to numerical instability. 
+ */ + runnable = shares * gcfs_rq->avg.runnable_load_avg; + if (runnable) + runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg); #endif - reweight_entity(cfs_rq_of(se), se, shares); + reweight_entity(cfs_rq_of(se), se, shares, runnable); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct sched_entity *se) +static inline void update_cfs_group(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3050,7 +3081,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ @@ -3067,10 +3098,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods); - } + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -3083,11 +3112,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, sa->period_contrib = delta; contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib * scale_cpu; @@ -3124,7 +3152,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ static __always_inline int ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -3157,8 +3185,8 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * this happens during idle_balance() which calls * update_blocked_averages() */ - if (!weight) - running = 0; + if (!load) + runnable = running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -3167,45 +3195,60 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) return 0; return 1; } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq) +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; /* * Step 2: update *_avg. 
*/ - sa->load_avg = div_u64(weight * sa->load_sum, divider); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, divider); - } + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); sa->util_avg = sa->util_sum / divider; } /* * sched_entity: * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * * load_sum := runnable_sum * load_avg = se_weight(se) * runnable_avg * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * * cfq_rs: * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg */ static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3215,10 +3258,13 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, - cfs_rq->curr == se, NULL)) { + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3230,8 +3276,10 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, cpu, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq)) { - ___update_load_avg(&cfs_rq->avg, 1, cfs_rq); + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); return 1; } @@ -3409,8 +3457,8 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long runnable_sum = gcfs_rq->prop_runnable_sum; - long load_avg; - s64 load_sum; + long runnable_load_avg, load_avg; + s64 runnable_load_sum, load_sum; if (!runnable_sum) return; @@ -3426,9 +3474,15 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf add_positive(&cfs_rq->avg.load_avg, load_avg); add_positive(&cfs_rq->avg.load_sum, load_sum); + runnable_load_sum = (s64)se_runnable(se) * runnable_sum; + runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + + add_positive(&se->avg.runnable_load_sum, runnable_sum); + add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + if (se->on_rq) { - add_positive(&cfs_rq->runnable_load_avg, load_avg); - add_positive(&cfs_rq->runnable_load_sum, load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); } } @@ -3710,7 +3764,7 @@ void remove_entity_load_avg(struct sched_entity *se) static inline unsigned long cfs_rq_runnable_load_avg(struct 
cfs_rq *cfs_rq) { - return cfs_rq->runnable_load_avg; + return cfs_rq->avg.runnable_load_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3882,8 +3936,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + update_cfs_group(se); enqueue_runnable_load_avg(cfs_rq, se); - update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -3989,7 +4043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(se); + update_cfs_group(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -4172,7 +4226,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(cfs_rq, curr, UPDATE_TG); - update_cfs_shares(curr); + update_cfs_group(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -5090,7 +5144,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -5149,7 +5203,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -7174,7 +7228,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->runnable_load_sum) + if (cfs_rq->avg.runnable_load_sum) return false; return true; @@ -9692,7 +9746,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(cfs_rq_of(se), se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5bcb86eb026b..e83d1b8be611 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -418,6 +418,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; + unsigned long runnable_weight; unsigned int nr_running, h_nr_running; u64 exec_clock; @@ -443,8 +444,6 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; - u64 runnable_load_sum; - unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif -- cgit v1.2.3 From 144d8487bc6e9b741895709cb46d4e19b748a725 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 May 2017 17:57:24 +0200 Subject: sched/fair: Implement synchronous PELT detach on load-balance migrate Vincent wondered why his self migrating task had a roughly 50% dip in load_avg when landing on the new CPU. This is because we unconditionally take the asynchronous detach_entity route, which can lead to the attach on the new CPU still seeing the old CPU's contribution to tg->load_avg, effectively halving the new CPU's shares. While in general this is something we have to live with, there is the special case of runnable migration where we can do better.
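A rough userspace model of the branch this patch adds to migrate_task_rq_fair() is sketched below; the enum and function names echo the kernel's, but the bodies are stand-ins and the program only illustrates the decision, not the actual implementation:

#include <stdio.h>

enum on_rq_state { TASK_NOT_ON_RQ, TASK_ON_RQ_QUEUED, TASK_ON_RQ_MIGRATING };

struct task { enum on_rq_state on_rq; };

/* Stand-in: in the kernel this detaches PELT state under the old rq->lock. */
static void detach_entity_cfs_rq(struct task *p)
{
	(void)p;
	printf("synchronous detach: old CPU's averages updated immediately\n");
}

/* Stand-in: the pre-existing asynchronous "removed" accounting path. */
static void remove_entity_load_avg(struct task *p)
{
	(void)p;
	printf("asynchronous removal: folded into the old CPU's averages later\n");
}

static void migrate_task_rq_fair_model(struct task *p)
{
	if (p->on_rq == TASK_ON_RQ_MIGRATING)
		detach_entity_cfs_rq(p);	/* load-balance migration: rq->lock held */
	else
		remove_entity_load_avg(p);	/* e.g. wakeup migration: no lock held */
}

int main(void)
{
	struct task lb_migrated = { TASK_ON_RQ_MIGRATING };
	struct task woken_up    = { TASK_NOT_ON_RQ };

	migrate_task_rq_fair_model(&lb_migrated);
	migrate_task_rq_fair_model(&woken_up);
	return 0;
}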
Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10d2000fca2d..92dbcc0fea46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3746,10 +3746,6 @@ void remove_entity_load_avg(struct sched_entity *se) * Similarly for groups, they will have passed through * post_init_entity_util_avg() before unregister_sched_fair_group() * calls this. - * - * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING - * we could actually get the right time, since we're called with - * rq->lock held, see detach_task(). */ sync_entity_load_avg(se); @@ -6292,6 +6288,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return new_cpu; } +static void detach_entity_cfs_rq(struct sched_entity *se); + /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6325,14 +6323,25 @@ static void migrate_task_rq_fair(struct task_struct *p) se->vruntime -= min_vruntime; } - /* - * We are supposed to update the task to "current" time, then its up to date - * and ready to go to new CPU/cfs_rq. But we have difficulty in getting - * what current time is, so simply throw away the out-of-date time. This - * will result in the wakee task is less decayed, but giving the wakee more - * load sounds not bad. - */ - remove_entity_load_avg(&p->se); + if (p->on_rq == TASK_ON_RQ_MIGRATING) { + /* + * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' + * rq->lock and can modify state directly. + */ + lockdep_assert_held(&task_rq(p)->lock); + detach_entity_cfs_rq(&p->se); + + } else { + /* + * We are supposed to update the task to "current" time, then + * its up to date and ready to go to new CPU/cfs_rq. But we + * have difficulty in getting what current time is, so simply + * throw away the out-of-date time. This will result in the + * wakee task is less decayed, but giving the wakee more load + * sounds not bad. + */ + remove_entity_load_avg(&p->se); + } /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; -- cgit v1.2.3 From f207934fb79d1af1de1a62b09d56a3a1914172c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 May 2017 14:16:30 +0200 Subject: sched/fair: Align PELT windows between cfs_rq and its se The PELT _sum values are a saw-tooth function, dropping on the decay edge and then growing back up again during the window. When these window-edges are not aligned between cfs_rq and se, we can have the situation where, for example, on dequeue, the se decays first. Its _sum values will be small(er), while the cfs_rq _sum values will still be on their way up. Because of this, the subtraction: cfs_rq->avg._sum -= se->avg._sum will result in a positive value. This will then, once the cfs_rq reaches an edge, translate into its _avg value jumping up. This is especially visible with the runnable_load bits, since they get added/subtracted a lot. 
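The toy program below illustrates that residue numerically; the decay factor and sums are assumed values and the real PELT code uses fixed-point arithmetic, but the effect is the same: the se crosses its decay edge first, and subtracting its freshly decayed sum from the not-yet-decayed cfs_rq sum leaves a positive remainder.

#include <stdio.h>

int main(void)
{
	/* Per-1024us PELT decay factor; chosen so that y^32 ~= 0.5. */
	double y = 0.97857206;
	double se_sum = 20000.0;	/* one entity's contribution */
	double rq_sum = 20000.0;	/* cfs_rq sum made up of just that entity */

	/* The se hits its window edge first (windows are not aligned)... */
	se_sum *= y;
	/* ...and is dequeued before the cfs_rq sum has decayed. */
	rq_sum -= se_sum;

	printf("residue left behind in the cfs_rq sum: %.1f (ideally 0)\n", rq_sum);
	return 0;
}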
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 92dbcc0fea46..954b332cd899 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -716,13 +716,8 @@ void init_entity_runnable_average(struct sched_entity *se) { struct sched_avg *sa = &se->avg; - sa->last_update_time = 0; - /* - * sched_avg's period_contrib should be strictly less then 1024, so - * we give it 1023 to make sure it is almost a period (1024us), and - * will definitely be update (after enqueue). - */ - sa->period_contrib = 1023; + memset(sa, 0, sizeof(*sa)); + /* * Tasks are intialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. @@ -731,13 +726,9 @@ void init_entity_runnable_average(struct sched_entity *se) */ if (entity_is_task(se)) sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); - sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX; - /* - * At this point, util_avg won't be used in select_task_rq_fair anyway - */ - sa->util_avg = 0; - sa->util_sum = 0; + se->runnable_weight = se->load.weight; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -785,7 +776,6 @@ void post_init_entity_util_avg(struct sched_entity *se) } else { sa->util_avg = cap; } - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } if (entity_is_task(se)) { @@ -3632,7 +3622,34 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + + /* + * When we attach the @se to the @cfs_rq, we must align the decay + * window because without that, really weird and wonderful things can + * happen. + * + * XXX illustrate + */ se->avg.last_update_time = cfs_rq->avg.last_update_time; + se->avg.period_contrib = cfs_rq->avg.period_contrib; + + /* + * Hell(o) Nasty stuff.. we need to recompute _sum based on the new + * period_contrib. This isn't strictly correct, but since we're + * entirely outside of the PELT hierarchy, nobody cares if we truncate + * _sum a little. + */ + se->avg.util_sum = se->avg.util_avg * divider; + + se->avg.load_sum = divider; + if (se_weight(se)) { + se->avg.load_sum = + div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); + } + + se->avg.runnable_load_sum = se->avg.load_sum; + enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; -- cgit v1.2.3 From 9a2dd585b2c431ec1e5d46a9d9568291c7a534cc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 May 2017 14:18:10 +0200 Subject: sched/fair: Implement more accurate async detach The problem with the overestimate is that it will subtract too big a value from the load_sum, thereby pushing it down further than it ought to go. Since runnable_load_avg is not subject to a similar 'force', this results in the occasional 'runnable_load > load' situation. 
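A back-of-the-envelope check of that overestimate, written as standalone C with assumed values (LOAD_AVG_MAX as used by PELT in this era, and an arbitrary period_contrib): multiplying the removed _avg by LOAD_AVG_MAX instead of the current divider subtracts up to r * 1024 too much from the _sum.

#include <stdio.h>

#define LOAD_AVG_MAX 47742	/* assumed PELT constant */

int main(void)
{
	unsigned long r = 300;			/* removed load_avg */
	unsigned int period_contrib = 200;	/* example partial window */

	/* What the old code subtracted from the _sum... */
	unsigned long old_sub = r * LOAD_AVG_MAX;
	/* ...versus what this patch subtracts, using the actual divider. */
	unsigned long new_sub = r * (LOAD_AVG_MAX - 1024 + period_contrib);

	printf("old subtraction: %lu\n", old_sub);
	printf("new subtraction: %lu\n", new_sub);
	printf("overestimate:    %lu = r * (1024 - period_contrib)\n",
	       old_sub - new_sub);
	return 0;
}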
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 954b332cd899..67c39642a512 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3574,6 +3574,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3582,17 +3583,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); - /* - * The LOAD_AVG_MAX for _sum is a slight over-estimate, - * which is safe due to sub_positive() clipping at 0. - */ r = removed_load; sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); + sub_positive(&sa->load_sum, r * divider); r = removed_util; sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + sub_positive(&sa->util_sum, r * divider); add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); -- cgit v1.2.3 From 2c8e4dce7963d2bae02db95fce2691365630685c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Aug 2017 11:13:39 -0400 Subject: sched/fair: Calculate runnable_weight slightly differently Our runnable_weight currently looks like this runnable_weight = shares * runnable_load_avg / load_avg The goal is to scale the runnable weight for the group based on its runnable to load_avg ratio. The problem with this is it biases us towards tasks that never go to sleep. Tasks that go to sleep are going to have their runnable_load_avg decayed pretty hard, which will drastically reduce the runnable weight of groups with interactive tasks. To solve this imbalance we tweak this slightly, so in the ideal case it is still the above, but in the interactive case it is runnable_weight = shares * runnable_weight / load_weight which will make the weight distribution fairer between interactive and non-interactive groups. Signed-off-by: Josef Bacik Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linux-kernel@vger.kernel.org Cc: riel@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1501773219-18774-2-git-send-email-jbacik@fb.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 67c39642a512..a62098ec9deb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2883,7 +2883,7 @@ void reweight_task(struct task_struct *p, int prio) * * hence icky! 
*/ -static long calc_cfs_shares(struct cfs_rq *cfs_rq) +static long calc_group_shares(struct cfs_rq *cfs_rq) { long tg_weight, tg_shares, load, shares; struct task_group *tg = cfs_rq->tg; @@ -2920,6 +2920,36 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } + +/* + * The runnable shares of this group are calculated as such + * + * max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight) + * shares * ------------------------------------------------------------ + * max(cfs_rq->avg.load_avg, cfs_rq->load.weight) + * + * We do this to keep the shares in line with expected load on the cfs_rq. + * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first + * time, it's runnable_load_avg is not going to be representative of the actual + * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq. + * The weight on the cfs_rq is the immediate effect of having new tasks + * enqueue'd onto it which should be used to calculate the new runnable shares. + * At the same time we need the actual load_avg to be the lower bounds for the + * calculation, to handle when our weight drops quickly from having entities + * dequeued. + */ +static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) +{ + long load_avg = max(cfs_rq->avg.load_avg, + scale_load_down(cfs_rq->load.weight)); + long runnable = max(cfs_rq->avg.runnable_load_avg, + scale_load_down(cfs_rq->runnable_weight)); + + runnable *= shares; + if (load_avg) + runnable /= load_avg; + return clamp_t(long, runnable, MIN_SHARES, shares); +} # endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -2945,17 +2975,8 @@ static void update_cfs_group(struct sched_entity *se) if (likely(se->load.weight == shares)) return; #else - shares = calc_cfs_shares(gcfs_rq); - /* - * The hierarchical runnable load metric is the proportional part - * of this group's runnable_load_avg / load_avg. - * - * Note: we need to deal with very sporadic 'runnable > load' cases - * due to numerical instability. - */ - runnable = shares * gcfs_rq->avg.runnable_load_avg; - if (runnable) - runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg); + shares = calc_group_shares(gcfs_rq); + runnable = calc_group_runnable(gcfs_rq, shares); #endif reweight_entity(cfs_rq_of(se), se, shares, runnable); -- cgit v1.2.3 From 17de4ee04ca925590df036b112c1db8a778e14bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Aug 2017 13:06:35 +0200 Subject: sched/fair: Update calc_group_*() comments I had a wee bit of trouble recalling how the calc_group_runnable() stuff worked.. add hopefully better comments. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 66 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a62098ec9deb..350dbec01523 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2860,7 +2860,7 @@ void reweight_task(struct task_struct *p, int prio) * Now, in that special case (1) reduces to: * * tg->weight * grq->load.weight - * ge->load.weight = ----------------------------- = tg>weight (4) + * ge->load.weight = ----------------------------- = tg->weight (4) * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. 
@@ -2874,6 +2874,18 @@ void reweight_task(struct task_struct *p, int prio) * --------------------------------------------------- (5) * tg->load_avg - grq->avg.load_avg + grq->load.weight * + * But because grq->load.weight can drop to 0, resulting in a divide by zero, + * we need to use grq->avg.load_avg as its lower bound, which then gives: + * + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- (6) + * tg_load_avg' + * + * Where: + * + * tg_load_avg' = tg->load_avg - grq->avg.load_avg + + * max(grq->load.weight, grq->avg.load_avg) * * And that is shares_weight and is icky. In the (near) UP case it approaches * (4) while in the normal case it approaches (3). It consistently @@ -2890,10 +2902,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) tg_shares = READ_ONCE(tg->shares); - /* - * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3) - * as a lower bound. - */ load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); tg_weight = atomic_long_read(&tg->load_avg); @@ -2922,32 +2930,46 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) } /* - * The runnable shares of this group are calculated as such + * This calculates the effective runnable weight for a group entity based on + * the group entity weight calculated above. + * + * Because of the above approximation (2), our group entity weight is + * an load_avg based ratio (3). This means that it includes blocked load and + * does not represent the runnable weight. * - * max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight) - * shares * ------------------------------------------------------------ - * max(cfs_rq->avg.load_avg, cfs_rq->load.weight) + * Approximate the group entity's runnable weight per ratio from the group + * runqueue: * - * We do this to keep the shares in line with expected load on the cfs_rq. - * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first - * time, it's runnable_load_avg is not going to be representative of the actual - * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq. - * The weight on the cfs_rq is the immediate effect of having new tasks - * enqueue'd onto it which should be used to calculate the new runnable shares. - * At the same time we need the actual load_avg to be the lower bounds for the - * calculation, to handle when our weight drops quickly from having entities - * dequeued. + * grq->avg.runnable_load_avg + * ge->runnable_weight = ge->load.weight * -------------------------- (7) + * grq->avg.load_avg + * + * However, analogous to above, since the avg numbers are slow, this leads to + * transients in the from-idle case. Instead we use: + * + * ge->runnable_weight = ge->load.weight * + * + * max(grq->avg.runnable_load_avg, grq->runnable_weight) + * ----------------------------------------------------- (8) + * max(grq->avg.load_avg, grq->load.weight) + * + * Where these max() serve both to use the 'instant' values to fix the slow + * from-idle and avoid the /0 on to-idle, similar to (6). 
*/ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) { - long load_avg = max(cfs_rq->avg.load_avg, - scale_load_down(cfs_rq->load.weight)); - long runnable = max(cfs_rq->avg.runnable_load_avg, - scale_load_down(cfs_rq->runnable_weight)); + long runnable, load_avg; + + load_avg = max(cfs_rq->avg.load_avg, + scale_load_down(cfs_rq->load.weight)); + + runnable = max(cfs_rq->avg.runnable_load_avg, + scale_load_down(cfs_rq->runnable_weight)); runnable *= shares; if (load_avg) runnable /= load_avg; + return clamp_t(long, runnable, MIN_SHARES, shares); } # endif /* CONFIG_SMP */ -- cgit v1.2.3 From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Sep 2017 13:43:15 -0400 Subject: fix infoleak in waitid(2) kernel_waitid() can return a PID, an error or 0. rusage is filled in the first case and waitid(2) rusage should've been copied out exactly in that case, *not* whenever kernel_waitid() has not returned an error. Compat variant shares that braino; none of kernel_wait4() callers do, so the below ought to fix it. Reported-and-tested-by: Alexander Potapenko Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland") Cc: stable@vger.kernel.org # v4.13 Signed-off-by: Al Viro --- kernel/exit.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3481ababd06a..f2cd53e92147 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; + if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } @@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err && uru) { - /* kernel_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - err = copy_to_user(uru, &ru, sizeof(ru)); - else - err = put_compat_rusage(&ru, uru); - if (err) - return -EFAULT; + if (uru) { + /* kernel_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + err = copy_to_user(uru, &ru, sizeof(ru)); + else + err = put_compat_rusage(&ru, uru); + if (err) + return -EFAULT; + } } if (!infop) -- cgit v1.2.3 From a1f7164c7b8b0d46f63bfb4ca0bb5971c760b921 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 09:00:18 -0700 Subject: sched: Misc preps for cgroup unified hierarchy interface Make the following changes in preparation for the cpu controller interface implementation for cgroup2. This patch doesn't cause any functional differences. * s/cpu_stats_show()/cpu_cfs_stat_show()/ * s/cpu_files/cpu_legacy_files/ v2: Dropped cpuacct changes as it won't be used by cpu controller interface anymore. 
Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..6815fa424a7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6599,7 +6599,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct seq_file *sf, void *v) +static int cpu_cfs_stat_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -6639,7 +6639,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ -static struct cftype cpu_files[] = { +static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -6660,7 +6660,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .seq_show = cpu_stats_show, + .seq_show = cpu_cfs_stat_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -6686,7 +6686,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_files, + .legacy_cftypes = cpu_legacy_files, .early_init = true, }; -- cgit v1.2.3 From 0d5936344f30aba0f6ddb92b030cb6a05168efe6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 09:00:19 -0700 Subject: sched: Implement interface for cgroup unified hierarchy There are a couple of interface issues which can be addressed in the cgroup2 interface. * Stats from cpuacct being reported separately from the cpu stats. * Use of different time units. Writable control knobs use microseconds, some stat fields use nanoseconds while other cpuacct stat fields use centiseconds. * Control knobs which can't be used in the root cgroup still show up in the root. * Control knob names and semantics aren't consistent with other controllers. This patchset implements the cpu controller's interface on cgroup2 which adheres to the controller file conventions described in Documentation/cgroups/cgroup-v2.txt. Overall, the following changes are made. * cpuacct is implicitly enabled and disabled by cpu and its information is reported through "cpu.stat" which now uses microseconds for all time durations. All time duration fields now have "_usec" appended to them for clarity. Note that cpuacct.usage_percpu is currently not included in "cpu.stat". If this information is actually called for, it will be added later. * "cpu.shares" is replaced with "cpu.weight" and operates on the standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). The weight is scaled to scheduler weight so that 100 maps to 1024 and the ratio relationship is preserved - if weight is W and its scaled value is S, W / 100 == S / 1024. While the mapped range is a bit smaller than the original scheduler weight range, the dead zones on both sides are relatively small and it covers a wider range than the nice value mappings. This file doesn't make sense in the root cgroup and isn't created on root. * "cpu.weight.nice" is added. When read, it reads back the nice value which is closest to the current "cpu.weight". When written, it sets "cpu.weight" to the weight value which matches the nice value. This makes it easy to configure cgroups when they're competing against threads in threaded subtrees.
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" which contains both quota and period. v4: - Use cgroup2 basic usage stat as the information source instead of cpuacct. v3: - Added "cpu.weight.nice" to allow using nice values when configuring the weight. The feature is requested by PeterZ. - Merge the patch to enable threaded support on cpu and cpuacct. - Dropped the bits about getting rid of cpuacct from patch description as there is a pretty strong case for making cpuacct an implicit controller so that basic cpu usage stats are always available. - Documentation updated accordingly. "cpu.rt.max" section is dropped for now. v2: - cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for CFS bandwidth stats and also using raw division for u64. Use CONFIG_CFS_BANDWITH and do_div() instead. "cpu.rt.max" is not included yet. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- kernel/sched/core.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6815fa424a7a..ad255162a830 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6678,6 +6678,175 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; +static int cpu_stat_show(struct seq_file *sf, void *v) +{ + cgroup_stat_show_cputime(sf, ""); + +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(seq_css(sf)); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 throttled_usec; + + throttled_usec = cfs_b->throttled_time; + do_div(throttled_usec, NSEC_PER_USEC); + + seq_printf(sf, "nr_periods %d\n" + "nr_throttled %d\n" + "throttled_usec %llu\n", + cfs_b->nr_periods, cfs_b->nr_throttled, + throttled_usec); + } +#endif + return 0; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + u64 weight = scale_load_down(tg->shares); + + return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); +} + +static int cpu_weight_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 weight) +{ + /* + * cgroup weight knobs should use the common MIN, DFL and MAX + * values which are 1, 100 and 10000 respectively. While it loses + * a bit of range on both ends, it maps pretty well onto the shares + * value used by scheduler and the round-trip conversions preserve + * the original value over the entire range. 
+ */ + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + return -ERANGE; + + weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} + +static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + unsigned long weight = scale_load_down(css_tg(css)->shares); + int last_delta = INT_MAX; + int prio, delta; + + /* find the closest nice value to the current weight */ + for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { + delta = abs(sched_prio_to_weight[prio] - weight); + if (delta >= last_delta) + break; + last_delta = delta; + } + + return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); +} + +static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 nice) +{ + unsigned long weight; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; + + weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} +#endif + +static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, + long period, long quota) +{ + if (quota < 0) + seq_puts(sf, "max"); + else + seq_printf(sf, "%ld", quota); + + seq_printf(sf, " %ld\n", period); +} + +/* caller should put the current value in *@periodp before calling */ +static int __maybe_unused cpu_period_quota_parse(char *buf, + u64 *periodp, u64 *quotap) +{ + char tok[21]; /* U64_MAX */ + + if (!sscanf(buf, "%s %llu", tok, periodp)) + return -EINVAL; + + *periodp *= NSEC_PER_USEC; + + if (sscanf(tok, "%llu", quotap)) + *quotap *= NSEC_PER_USEC; + else if (!strcmp(tok, "max")) + *quotap = RUNTIME_INF; + else + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_CFS_BANDWIDTH +static int cpu_max_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + return 0; +} + +static ssize_t cpu_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct task_group *tg = css_tg(of_css(of)); + u64 period = tg_get_cfs_period(tg); + u64 quota; + int ret; + + ret = cpu_period_quota_parse(buf, &period, "a); + if (!ret) + ret = tg_set_cfs_bandwidth(tg, period, quota); + return ret ?: nbytes; +} +#endif + +static struct cftype cpu_files[] = { + { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_stat_show, + }, +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_weight_read_u64, + .write_u64 = cpu_weight_write_u64, + }, + { + .name = "weight.nice", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_weight_nice_read_s64, + .write_s64 = cpu_weight_nice_write_s64, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_max_show, + .write = cpu_max_write, + }, +#endif + { } /* terminate */ +}; + struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, @@ -6687,7 +6856,9 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_legacy_files, + .dfl_cftypes = cpu_files, .early_init = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_SCHED */ -- cgit v1.2.3 From 721e08dad17e226ef68819d0a23dc53c25fe8ea5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 29 Sep 2017 10:52:17 -0700 Subject: bpf: Fix compiler warning on info.map_ids for 32bit 
platform This patch uses u64_to_user_ptr() to cast info.map_ids to a userspace ptr. It also tags the user_map_ids with '__user' for sparse check. Fixes: cb4d2b3f03d8 ("bpf: Add name, load_time, uid and map_ids to bpf_prog_info") Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 11a7f82a55d1..b927da66f653 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1405,7 +1405,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); if (ulen) { - u32 *user_map_ids = (u32 *)info.map_ids; + u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); u32 i; for (i = 0; i < ulen; i++) -- cgit v1.2.3 From 7813dd6fc75fb375d4caf002e7f80a826fc3153a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 26 Sep 2017 15:12:40 -0700 Subject: PM / OPP: Move the OPP directory out of power/ The drivers/base/power/ directory is special and contains code related to power management core like system suspend/resume, hibernation, etc. It was fine to keep the OPP code inside it when we had just one file for it, but it is growing now and already has a directory for itself. Lets move it directly under drivers/ directory, just like cpufreq and cpuidle. Signed-off-by: Viresh Kumar Acked-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e8517b63eb37..e880ca22c5a5 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -259,20 +259,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config PM_OPP - bool - select SRCU - ---help--- - SOCs have a standard set of tuples consisting of frequency and - voltage pairs that the device will support per voltage domain. This - is called Operating Performance Point or OPP. The actual definitions - of OPP varies over silicon within the same family of devices. - - OPP layer organizes the data internally using device pointers - representing individual voltage domains and provides SOC - implementations a ready to use framework to manage OPPs. - For more information, read - config PM_CLK def_bool y depends on PM && HAVE_CLK -- cgit v1.2.3 From 64ec72a1ece37d9bc7ba8b11d6091ce7cb1d8eec Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 27 Sep 2017 22:01:34 -0700 Subject: PM: Use a more common logging style Convert printks to pr_. Miscellanea: o Use pr_fmt with "PM:" and remove "PM: " from format strings o Coalesce format strings and realign format arguments o Convert an embedded incorrect function name to "%s: ", __func__ o Convert a couple multi-line formats to multiple pr_ calls Signed-off-by: Joe Perches Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/qos.c | 4 +- kernel/power/snapshot.c | 35 ++++++------- kernel/power/swap.c | 128 +++++++++++++++++++++--------------------------- 3 files changed, 77 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..9d7503910ce2 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void) for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i], d); if (ret < 0) { - printk(KERN_ERR "pm_qos_param: %s setup failed\n", - pm_qos_array[i]->name); + pr_err("%s: %s setup failed\n", + __func__, pm_qos_array[i]->name); return ret; } } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0972a8e09d08..a917a301e201 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) "PM: " fmt + #include #include #include @@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn, region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n", (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); } @@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n", (unsigned long long) region->start_pfn << PAGE_SHIFT, ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); @@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void) free_pages_map = bm2; mark_nosave_pages(forbidden_pages_map); - pr_debug("PM: Basic memory bitmaps created\n"); + pr_debug("Basic memory bitmaps created\n"); return 0; @@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void) memory_bm_free(bm2, PG_UNSAFE_CLEAR); kfree(bm2); - pr_debug("PM: Basic memory bitmaps freed\n"); + pr_debug("Basic memory bitmaps freed\n"); } void clear_free_pages(void) @@ -1152,7 +1154,7 @@ void clear_free_pages(void) pfn = memory_bm_next_pfn(bm); } memory_bm_position_reset(bm); - pr_info("PM: free pages cleared after restore\n"); + pr_info("free pages cleared after restore\n"); #endif /* PAGE_POISONING_ZERO */ } @@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void) ktime_t start, stop; int error; - printk(KERN_INFO "PM: Preallocating image memory... "); + pr_info("Preallocating image memory... 
"); start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); @@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void) out: stop = ktime_get(); - printk(KERN_CONT "done (allocated %lu pages)\n", pages); + pr_cont("done (allocated %lu pages)\n", pages); swsusp_show_speed(start, stop, pages, "Allocated"); return 0; err_out: - printk(KERN_CONT "\n"); + pr_cont("\n"); swsusp_free(); return -ENOMEM; } @@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) free += zone_page_state(zone, NR_FREE_PAGES); nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, free); + pr_debug("Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); return free > nr_pages + PAGES_FOR_IO; } @@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - printk(KERN_INFO "PM: Creating hibernation image:\n"); + pr_info("Creating hibernation image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); + pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Not enough free memory\n"); + pr_err("Not enough free memory\n"); return -ENOMEM; } if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Memory allocation failed\n"); + pr_err("Memory allocation failed\n"); return -ENOMEM; } @@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void) nr_copy_pages = nr_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", - nr_pages); + pr_info("Hibernation image created (%d pages copied)\n", nr_pages); return 0; } @@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info) if (!reason && info->num_physpages != get_num_physpages()) reason = "memory size"; if (reason) { - printk(KERN_ERR "PM: Image mismatch: %s\n", reason); + pr_err("Image mismatch: %s\n", reason); return -EPERM; } return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d7cdc426ee38..293ead59eccc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,6 +12,8 @@ * */ +#define pr_fmt(fmt) "PM: " fmt + #include #include #include @@ -241,9 +243,9 @@ static void hib_end_io(struct bio *bio) struct page *page = bio->bi_io_vec[0].bv_page; if (bio->bi_status) { - printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); } if (bio_data_dir(bio) == WRITE) @@ -273,8 +275,8 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", - (unsigned long long)bio->bi_iter.bi_sector); + pr_err("Adding page to bio failed at %llu\n", + (unsigned long long)bio->bi_iter.bi_sector); bio_put(bio); return -EFAULT; } @@ -319,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { - printk(KERN_ERR 
"PM: Swap header not found!\n"); + pr_err("Swap header not found!\n"); error = -ENODEV; } return error; @@ -413,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle) ret = swsusp_swap_check(); if (ret) { if (ret != -ENOSPC) - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + pr_err("Cannot find swap device, try swapon -a\n"); return ret; } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); @@ -491,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, { if (!error) { flush_swap_writer(handle); - printk(KERN_INFO "PM: S"); + pr_info("S"); error = mark_swapfiles(handle, flags); - printk("|\n"); + pr_cont("|\n"); } if (error) @@ -542,7 +543,7 @@ static int save_image(struct swap_map_handle *handle, hib_init_batch(&hb); - printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", + pr_info("Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) @@ -557,8 +558,8 @@ static int save_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_INFO "PM: Image saving progress: %3d%%\n", - nr_pages / m * 10); + pr_info("Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); @@ -566,7 +567,7 @@ static int save_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) - printk(KERN_INFO "PM: Image saving done.\n"); + pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } @@ -692,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle, page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; goto out_clean; } data = vmalloc(sizeof(*data) * nr_threads); if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; goto out_clean; } @@ -708,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle, crc = kmalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); + pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } @@ -726,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle, "image_compress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start compression threads\n"); + pr_err("Cannot start compression threads\n"); ret = -ENOMEM; goto out_clean; } @@ -749,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle, crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } @@ -760,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle, */ handle->reqd_free_pages = reqd_free_pages(); - printk(KERN_INFO - "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages)...\n", - nr_threads, nr_to_write); + pr_info("Using %u thread(s) for compression\n", nr_threads); + pr_info("Compressing and saving image data (%u pages)...\n", + nr_to_write); m = nr_to_write / 10; if (!m) m = 1; @@ -783,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_INFO - "PM: Image saving progress: " - "%3d%%\n", - nr_pages / m * 10); + pr_info("Image saving 
progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } if (!off) @@ -813,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle, ret = data[thr].ret; if (ret < 0) { - printk(KERN_ERR "PM: LZO compression failed\n"); + pr_err("LZO compression failed\n"); goto out_finish; } if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > lzo1x_worst_compress(data[thr].unc_len))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); + pr_err("Invalid LZO compressed length\n"); ret = -1; goto out_finish; } @@ -857,7 +853,7 @@ out_finish: if (!ret) ret = err2; if (!ret) - printk(KERN_INFO "PM: Image saving done.\n"); + pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: if (crc) { @@ -888,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags) unsigned int free_swap = count_swap_pages(root_swap, 1); unsigned int required; - pr_debug("PM: Free swap pages: %u\n", free_swap); + pr_debug("Free swap pages: %u\n", free_swap); required = PAGES_FOR_IO + nr_pages; return free_swap > required; @@ -915,12 +911,12 @@ int swsusp_write(unsigned int flags) pages = snapshot_get_image_size(); error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot get swap writer\n"); + pr_err("Cannot get swap writer\n"); return error; } if (flags & SF_NOCOMPRESS_MODE) { if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); + pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; } @@ -1068,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); clean_pages_on_read = true; - printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", - nr_to_read); + pr_info("Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; @@ -1087,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_INFO "PM: Image loading progress: %3d%%\n", - nr_pages / m * 10); + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); @@ -1096,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) { - printk(KERN_INFO "PM: Image loading done.\n"); + pr_info("Image loading done\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; @@ -1190,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle, page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; goto out_clean; } data = vmalloc(sizeof(*data) * nr_threads); if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; goto out_clean; } @@ -1206,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle, crc = kmalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); + pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } @@ -1226,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle, "image_decompress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start decompression threads\n"); + pr_err("Cannot start decompression threads\n"); ret = -ENOMEM; goto out_clean; } @@ -1249,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle, crc->thr = kthread_run(crc32_threadfn, 
crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } @@ -1274,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!page[i]) { if (i < LZO_CMP_PAGES) { ring_size = i; - printk(KERN_ERR - "PM: Failed to allocate LZO pages\n"); + pr_err("Failed to allocate LZO pages\n"); ret = -ENOMEM; goto out_clean; } else { @@ -1285,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle, } want = ring_size = i; - printk(KERN_INFO - "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages)...\n", - nr_threads, nr_to_read); + pr_info("Using %u thread(s) for decompression\n", nr_threads); + pr_info("Loading and decompressing image data (%u pages)...\n", + nr_to_read); m = nr_to_read / 10; if (!m) m = 1; @@ -1348,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); + pr_err("Invalid LZO compressed length\n"); ret = -1; goto out_finish; } @@ -1400,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle, ret = data[thr].ret; if (ret < 0) { - printk(KERN_ERR - "PM: LZO decompression failed\n"); + pr_err("LZO decompression failed\n"); goto out_finish; } if (unlikely(!data[thr].unc_len || data[thr].unc_len > LZO_UNC_SIZE || data[thr].unc_len & (PAGE_SIZE - 1))) { - printk(KERN_ERR - "PM: Invalid LZO uncompressed length\n"); + pr_err("Invalid LZO uncompressed length\n"); ret = -1; goto out_finish; } @@ -1420,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_INFO - "PM: Image loading progress: " - "%3d%%\n", - nr_pages / m * 10); + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); @@ -1448,15 +1435,14 @@ out_finish: } stop = ktime_get(); if (!ret) { - printk(KERN_INFO "PM: Image loading done.\n"); + pr_info("Image loading done\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; if (!ret) { if (swsusp_header->flags & SF_CRC32_MODE) { if(handle->crc32 != swsusp_header->crc32) { - printk(KERN_ERR - "PM: Invalid image CRC32!\n"); + pr_err("Invalid image CRC32!\n"); ret = -ENODATA; } } @@ -1513,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p) swap_reader_finish(&handle); end: if (!error) - pr_debug("PM: Image successfully loaded\n"); + pr_debug("Image successfully loaded\n"); else - pr_debug("PM: Error %d resuming\n", error); + pr_debug("Error %d resuming\n", error); return error; } @@ -1552,13 +1538,13 @@ put: if (error) blkdev_put(hib_resume_bdev, FMODE_READ); else - pr_debug("PM: Image signature found, resuming\n"); + pr_debug("Image signature found, resuming\n"); } else { error = PTR_ERR(hib_resume_bdev); } if (error) - pr_debug("PM: Image not found (code %d)\n", error); + pr_debug("Image not found (code %d)\n", error); return error; } @@ -1570,7 +1556,7 @@ put: void swsusp_close(fmode_t mode) { if (IS_ERR(hib_resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); + pr_debug("Image device not initialised\n"); return; } @@ -1594,7 +1580,7 @@ int swsusp_unmark(void) swsusp_resume_block, swsusp_header, NULL); } else { - printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + pr_err("Cannot find swsusp 
signature!\n"); error = -ENODEV; } -- cgit v1.2.3 From 2b0b8499ae75df91455bbeb7491d45affc384fb0 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 12 Sep 2017 10:14:54 +0800 Subject: ftrace: Fix kmemleak in unregister_ftrace_graph The trampoline allocated by function tracer was overwriten by function_graph tracer, and caused a memory leak. The save_global_trampoline should have saved the previous trampoline in register_ftrace_graph() and restored it in unregister_ftrace_graph(). But as it is implemented, save_global_trampoline was only used in unregister_ftrace_graph as default value 0, and it overwrote the previous trampoline's value. Causing the previous allocated trampoline to be lost. kmmeleak backtrace: kmemleak_vmalloc+0x77/0xc0 __vmalloc_node_range+0x1b5/0x2c0 module_alloc+0x7c/0xd0 arch_ftrace_update_trampoline+0xb5/0x290 ftrace_startup+0x78/0x210 register_ftrace_function+0x8b/0xd0 function_trace_init+0x4f/0x80 tracing_set_tracer+0xe6/0x170 tracing_set_trace_write+0x90/0xd0 __vfs_write+0x37/0x170 vfs_write+0xb2/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x67/0x180 return_from_SYSCALL_64+0x0/0x6a [ Looking further into this, I found that this was left over from when the function and function graph tracers shared the same ftrace_ops. But in commit 5f151b2401 ("ftrace: Fix function_profiler and function tracer together"), the two were separated, and the save_global_trampoline no longer was necessary (and it may have been broken back then too). -- Steven Rostedt ] Link: http://lkml.kernel.org/r/20170912021454.5976-1-shuwang@redhat.com Cc: stable@vger.kernel.org Fixes: 5f151b2401 ("ftrace: Fix function_profiler and function tracer together") Signed-off-by: Shu Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From f39b536ce9248e9799ff900358d6f073ab2e6c55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Sep 2017 20:19:02 -0700 Subject: rcu: Remove extraneous READ_ONCE()s from rcu_irq_{enter,exit}() The read of ->dynticks_nmi_nesting in rcu_irq_enter() and rcu_irq_exit() is currently protected with READ_ONCE(). 
However, this protection is unnecessary because (1) ->dynticks_nmi_nesting is updated only by the current CPU, (2) Although NMI handlers can update this field, they reset it back to its old value before return, and (3) Interrupts are disabled, so nothing else can modify it. The value of ->dynticks_nmi_nesting is thus effectively constant, and so no protection is required. This commit therefore removes the READ_ONCE() protection from these two accesses. Link: http://lkml.kernel.org/r/20170926031902.GA2074@linux.vnet.ibm.com Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 63bee8e1b193..c03152f7e458 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -890,7 +890,7 @@ void rcu_irq_exit(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -1027,7 +1027,7 @@ void rcu_irq_enter(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; oldval = rdtp->dynticks_nesting; -- cgit v1.2.3 From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Oct 2017 15:37:20 -0700 Subject: bpf: fix bpf_tail_call() x64 JIT - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..7b62df86be1d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1022,7 +1022,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; -- cgit v1.2.3 From 630cc2b30a42c70628368a412beb4a5e5dd71abe Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:14:18 -0700 Subject: kernel/params.c: align add_sysfs_param documentation with code This parameter is named kp, so the documentation should use that. 
Fixes: 9b473de87209 ("param: Fix duplicate module prefixes") Link: http://lkml.kernel.org/r/20170919142656.64aea59e@endymion Signed-off-by: Jean Delvare Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 60b2d8101355..1cd8f1a895a8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject - * @kparam: the actual parameter definition to add to sysfs + * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and -- cgit v1.2.3 From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 3 Oct 2017 16:15:00 -0700 Subject: android: binder: drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. Also restore mmput_async, which was initially introduced in commit ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom reaper context"), and was removed in commit 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently"). Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Signed-off-by: Sherry Yang Signed-off-by: Greg Kroah-Hartman Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Acked-by: Michal Hocko Cc: Martijn Coenen Cc: Todd Kjos Cc: Riley Andrews Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Hoeun Ryu Cc: Christopher Lameter Cc: Vegard Nossum Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 10646182440f..e702cb9ffbd8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, + async_put_work); + + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} +#endif + /** * set_mm_exe_file - change a reference to the mm's executable file * -- cgit v1.2.3 From 3181c38e4df257852a0c0a53552fd5c869402886 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 3 Oct 2017 16:16:07 -0700 Subject: kernel/sysctl.c: remove duplicate UINT_MAX check on do_proc_douintvec_conv() do_proc_douintvec_conv() has two UINT_MAX checks, we can remove one. 
This has no functional changes other than fixing a compiler warning: kernel/sysctl.c:2190]: (warning) Identical condition '*lvalp>UINT_MAX', second condition is always false Fixes: 4f2fec00afa60 ("sysctl: simplify unsigned int support") Link: http://lkml.kernel.org/r/20170919072918.12066-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reported-by: David Binderman Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 423554ad3610..4da9e622471f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2186,8 +2186,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; -- cgit v1.2.3 From 1fdcce6e16c54facc4f0688630d3b9ecfcaa411f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:23 -0700 Subject: memremap: add scheduling point to devm_memremap_pages devm_memremap_pages is initializing struct pages in for_each_device_pfn and that can take quite some time. We have even seen a soft lockup triggering on a non preemptive kernel NMI watchdog: BUG: soft lockup - CPU#61 stuck for 22s! [kworker/u641:11:1808] [...] RIP: 0010:[] [] devm_memremap_pages+0x327/0x430 [...] Call Trace: pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 fix this by adding cond_resched every 1024 pages. Link: http://lkml.kernel.org/r/20170918121410.24466-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 6bcbfbf1a8fd..403ab9cdb949 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; - int error, nid, is_ram; + int error, nid, is_ram, i = 0; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, list_del(&page->lru); page->pgmap = pgmap; percpu_ref_get(ref); + if (!(++i % 1024)) + cond_resched(); } devres_add(dev, page_map); return __va(res->start); -- cgit v1.2.3 From c9653850c90d6050197bc76ba672e00a7771aea5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 3 Oct 2017 16:16:26 -0700 Subject: kernel/kcmp.c: drop branch leftover typo The else branch been left over and escaped the source code refresh. Not a problem but better clean it up. 
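To make the control-flow difference concrete, here is a standalone sketch (plain C, illustrative only; the names and values are made up, this is not the kcmp code). With the leftover "else", the follow-up check runs only when the first branch was not taken; dropping the "else" makes it unconditional, which is exactly what the hunk below does with the IS_ERR(filp_tgt) test.

#include <stdio.h>

static int check_with_else(int took_branch, int err)
{
	if (took_branch) {
		/* ... work that may also set err ... */
	} else if (err)
		return err;		/* only reached when took_branch == 0 */
	return 0;
}

static int check_without_else(int took_branch, int err)
{
	if (took_branch) {
		/* ... work that may also set err ... */
	}
	if (err)
		return err;		/* always checked */
	return 0;
}

int main(void)
{
	printf("with else:    %d\n", check_with_else(1, -22));    /* prints 0   */
	printf("without else: %d\n", check_without_else(1, -22)); /* prints -22 */
	return 0;
}
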
Fixes: 0791e3644e5e ("kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files") Link: http://lkml.kernel.org/r/20170917165838.GA1887@uranus.lan Reported-by: Eugene Syromiatnikov Signed-off-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index ea34ed8bb952..055bb2962a0b 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1, if (filp_epoll) { filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); fput(filp_epoll); - } else + } if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); -- cgit v1.2.3 From 90ceb2a3ad868f800eb1c9f4ede650daddd94b77 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:35 -0700 Subject: kernel/params.c: fix the maximum length in param_get_string The length parameter of strlcpy() is supposed to reflect the size of the target buffer, not of the source string. Harmless in this case as the buffer is PAGE_SIZE long and the source string is always much shorter than this, but conceptually wrong, so let's fix it. Link: http://lkml.kernel.org/r/20170928162515.24846b4f@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 1cd8f1a895a8..8283ba045f4f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, kps->maxlen); + return strlcpy(buffer, kps->string, PAGE_SIZE); } EXPORT_SYMBOL(param_get_string); -- cgit v1.2.3 From 96802e6b1dbf29d3012b39503c5dd6d9d8e82955 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:38 -0700 Subject: kernel/params.c: fix an overflow in param_attr_show Function param_attr_show could overflow the buffer it is operating on. The buffer size is PAGE_SIZE, and the string returned by attribute->param->ops->get is generated by scnprintf(buffer, PAGE_SIZE, ...) so it could be PAGE_SIZE - 1 long, with the terminating '\0' at the very end of the buffer. Calling strcat(..., "\n") on this isn't safe, as the '\0' will be replaced by '\n' (OK) and then another '\0' will be added past the end of the buffer (not OK.) Simply add the trailing '\n' when writing the attribute contents to the buffer originally. This is safe, and also faster. Credits to Teradata for discovering this issue. 
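The off-by-one can be reproduced in userspace with a tiny buffer. The following sketch uses snprintf() as a stand-in for scnprintf() and an 8-byte "page"; one spare byte is kept in the backing array so the demonstration itself stays in bounds (illustrative only, not the kernel code).

#include <stdio.h>
#include <string.h>

#define BUFSZ 8				/* stand-in for PAGE_SIZE */

int main(void)
{
	char backing[BUFSZ + 1];	/* spare byte keeps the demo in bounds */
	char *buf = backing;

	/* scnprintf()-style fill: at most BUFSZ - 1 characters plus '\0'. */
	snprintf(buf, BUFSZ, "%s", "abcdefghij");
	printf("length after fill: %zu (maximum is BUFSZ - 1 = %d)\n",
	       strlen(buf), BUFSZ - 1);

	/* strcat() writes '\n' over the old terminator at index BUFSZ - 1
	 * and a new '\0' at index BUFSZ, one byte past a BUFSZ-sized buffer. */
	strcat(buf, "\n");
	printf("new terminator lands at index %zu\n", strlen(buf));
	return 0;
}

The hunks below take the other route: each get() handler emits the trailing '\n' through its own scnprintf()/sprintf() format, so the output never has to grow after the fact.
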
Link: http://lkml.kernel.org/r/20170928162602.60c379c7@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 8283ba045f4f..0cca488263dd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -224,7 +224,7 @@ char *parse_args(const char *doing, } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ - return scnprintf(buffer, PAGE_SIZE, format, \ + return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ @@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { - return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); + return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); @@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); + return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); @@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { + /* Replace \n with comma */ if (i) - buffer[off++] = ','; + buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); @@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, PAGE_SIZE); + return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); @@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr, kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); - if (count > 0) { - strcat(buf, "\n"); - ++count; - } return count; } -- cgit v1.2.3 From e0596c80f442d1e1221c17dbb71b2aed43909221 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:41 -0700 Subject: kernel/params.c: improve STANDARD_PARAM_DEF readability Align the parameters passed to STANDARD_PARAM_DEF for clarity. 
Link: http://lkml.kernel.org/r/20170928162728.756143cc@endymion Signed-off-by: Jean Delvare Suggested-by: Ingo Molnar Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 0cca488263dd..cc9108c2a1fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -236,14 +236,14 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { -- cgit v1.2.3 From 6b9dc4806b28214a4a260517e59439e0ac12a15e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Oct 2017 12:34:50 +0200 Subject: watchdog/core, powerpc: Replace watchdog_nmi_reconfigure() The recent cleanup of the watchdog code split watchdog_nmi_reconfigure() into two stages. One to stop the NMI and one to restart it after reconfiguration. That was done by adding a boolean 'run' argument to the code, which is functionally correct but not necessarily a piece of art. Replace it by two explicit functions: watchdog_nmi_stop() and watchdog_nmi_start(). 
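The shape of the replacement, reduced to a standalone sketch: weak no-op hooks that an architecture can override, with the core updating the configuration between the two calls. The hook names mirror the patch, but the bodies and the rest are made up for illustration.

#include <stdio.h>

/* Weak no-op defaults; an architecture would provide strong overrides. */
__attribute__((weak)) void watchdog_nmi_stop(void)  { }
__attribute__((weak)) void watchdog_nmi_start(void) { }

static int watchdog_thresh = 10;

static void reconfigure(int new_thresh)
{
	watchdog_nmi_stop();		/* NMI side quiesced           */
	watchdog_thresh = new_thresh;	/* variables are stable here   */
	watchdog_nmi_start();		/* restart picks up new values */
}

int main(void)
{
	reconfigure(20);
	printf("watchdog_thresh = %d\n", watchdog_thresh);
	return 0;
}
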
Fixes: 6592ad2fcc8f ("watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage") Requested-by: Linus 'Nursing his pet-peeve' Torvalds Signed-off-by: Thomas 'Mopping up garbage' Gleixner Acked-by: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710021957480.2114@nanos --- kernel/watchdog.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f6ef163b72cd..6ad6226535d0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -123,24 +123,27 @@ int __weak __init watchdog_nmi_probe(void) } /** - * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs - * @run: If false stop the watchdogs on all enabled CPUs - * If true start the watchdogs on all enabled CPUs + * watchdog_nmi_stop - Stop the watchdog for reconfiguration * - * The core call order is: - * watchdog_nmi_reconfigure(false); + * The reconfiguration steps are: + * watchdog_nmi_stop(); * update_variables(); - * watchdog_nmi_reconfigure(true); + * watchdog_nmi_start(); + */ +void __weak watchdog_nmi_stop(void) { } + +/** + * watchdog_nmi_start - Start the watchdog after reconfiguration * - * The second call which starts the watchdogs again guarantees that the - * following variables are stable across the call. + * Counterpart to watchdog_nmi_stop(). + * + * The following variables have been updated in update_variables() and + * contain the currently valid configuration: * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - * After the call the variables can be changed again. */ -void __weak watchdog_nmi_reconfigure(bool run) { } +void __weak watchdog_nmi_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit @@ -551,13 +554,13 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(void) { - watchdog_nmi_reconfigure(false); + watchdog_nmi_stop(); softlockup_park_all_threads(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); - watchdog_nmi_reconfigure(true); + watchdog_nmi_start(); } /* @@ -602,9 +605,9 @@ static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { - watchdog_nmi_reconfigure(false); + watchdog_nmi_stop(); lockup_detector_update_enable(); - watchdog_nmi_reconfigure(true); + watchdog_nmi_start(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -- cgit v1.2.3 From e31d6883f21c1cdfe5bc64e28411f8a92b783fde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Oct 2017 16:37:53 +0200 Subject: watchdog/core, powerpc: Lock cpus across reconfiguration Instead of dropping the cpu hotplug lock after stopping NMI watchdog and threads and reaquiring for restart, the code and the protection rules become more obvious when holding cpu hotplug lock across the full reconfiguration. 
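Reduced to a generic sketch, with a pthreads mutex standing in for the CPU hotplug lock (illustrative only; the kernel side uses cpus_read_lock()/cpus_read_unlock(), and the cleanup that can recurse into perf is deliberately left outside the locked region, as the hunk below shows).

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
static int threshold = 10;

static void nmi_stop(void)  { /* assumes hotplug_lock is held */ }
static void nmi_start(void) { /* assumes hotplug_lock is held */ }
static void cleanup(void)   { /* may take other locks of its own */ }

static void reconfigure(int new_thresh)
{
	pthread_mutex_lock(&hotplug_lock);	/* held across the whole sequence */
	nmi_stop();
	threshold = new_thresh;
	nmi_start();
	pthread_mutex_unlock(&hotplug_lock);

	cleanup();				/* outside the locked region */
}

int main(void)
{
	reconfigure(20);
	printf("threshold = %d\n", threshold);
	return 0;
}
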
Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710022105570.2114@nanos --- kernel/smpboot.c | 3 +-- kernel/watchdog.c | 10 +++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index ed7507b69b48..5043e7433f4b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -351,7 +351,7 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread static struct cpumask tmp; unsigned int cpu; - get_online_cpus(); + lockdep_assert_cpus_held(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ @@ -367,7 +367,6 @@ void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); } static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6ad6226535d0..fff90fe10007 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -535,7 +535,6 @@ static void softlockup_update_smpboot_threads(void) smpboot_update_cpumask_percpu_thread(&watchdog_threads, &watchdog_allowed_mask); - __lockup_detector_cleanup(); } /* Temporarily park all watchdog threads */ @@ -554,6 +553,7 @@ static void softlockup_unpark_threads(void) static void softlockup_reconfigure_threads(void) { + cpus_read_lock(); watchdog_nmi_stop(); softlockup_park_all_threads(); set_sample_period(); @@ -561,6 +561,12 @@ static void softlockup_reconfigure_threads(void) if (watchdog_enabled && watchdog_thresh) softlockup_unpark_threads(); watchdog_nmi_start(); + cpus_read_unlock(); + /* + * Must be called outside the cpus locked section to prevent + * recursive locking in the perf code. + */ + __lockup_detector_cleanup(); } /* @@ -605,9 +611,11 @@ static inline void watchdog_disable_all_cpus(void) { } static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { + cpus_read_lock(); watchdog_nmi_stop(); lockup_detector_update_enable(); watchdog_nmi_start(); + cpus_read_unlock(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -- cgit v1.2.3 From 34ddaa3e5c0096fef52485186c7eb6cf56ddc686 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Oct 2017 16:39:02 +0200 Subject: powerpc/watchdog: Make use of watchdog_nmi_probe() The rework of the core hotplug code triggers the WARN_ON in start_wd_cpu() on powerpc because it is called multiple times for the boot CPU. The first call is via: start_wd_on_cpu+0x80/0x2f0 watchdog_nmi_reconfigure+0x124/0x170 softlockup_reconfigure_threads+0x110/0x130 lockup_detector_init+0xbc/0xe0 kernel_init_freeable+0x18c/0x37c kernel_init+0x2c/0x160 ret_from_kernel_thread+0x5c/0xbc And then again via the CPU hotplug registration: start_wd_on_cpu+0x80/0x2f0 cpuhp_invoke_callback+0x194/0x620 cpuhp_thread_fun+0x7c/0x1b0 smpboot_thread_fn+0x290/0x2a0 kthread+0x168/0x1b0 ret_from_kernel_thread+0x5c/0xbc This can be avoided by setting up the cpu hotplug state with nocalls and move the initialization to the watchdog_nmi_probe() function. That initializes the hotplug callbacks without invoking the callback and the following core initialization function then configures the watchdog for the online CPUs (in this case CPU0) via softlockup_reconfigure_threads(). 
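The ordering problem and its fix, as a toy sketch (plain C; the cpuhp machinery is reduced to a flag and all names are made up): registering the hotplug callback without invoking it for already-online CPUs leaves the single reconfigure pass as the only caller, so the boot CPU is started exactly once.

#include <stdio.h>

static int wd_running;			/* boot CPU only, for the sketch */

static void start_wd_on_boot_cpu(void)
{
	if (wd_running)
		printf("WARN: watchdog started twice on the boot CPU\n");
	wd_running = 1;
}

/* "nocalls"-style registration: remember the callback, don't invoke it now. */
static void register_hotplug_state(int invoke_for_online_cpus)
{
	if (invoke_for_online_cpus)
		start_wd_on_boot_cpu();
}

static void reconfigure(void)
{
	start_wd_on_boot_cpu();		/* the core's single configuration pass */
}

int main(void)
{
	register_hotplug_state(0);	/* probe-time setup, no immediate start */
	reconfigure();			/* started exactly once, no WARN        */
	return 0;
}
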
Reported-and-tested-by: Michael Ellerman Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Nicholas Piggin Cc: linuxppc-dev@lists.ozlabs.org --- kernel/watchdog.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fff90fe10007..5c6fb7cd9ae8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -608,7 +608,6 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static inline void softlockup_init_threads(void) { } static void softlockup_reconfigure_threads(void) { cpus_read_lock(); @@ -617,6 +616,10 @@ static void softlockup_reconfigure_threads(void) watchdog_nmi_start(); cpus_read_unlock(); } +static inline void softlockup_init_threads(void) +{ + softlockup_reconfigure_threads(); +} #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_cleanup(void) -- cgit v1.2.3 From 5587185ddb4b9f413299dfec0a022ad0212513e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Oct 2017 10:03:04 +0200 Subject: watchdog/core: Rename some softlockup_* functions The function names made sense up to the point where the watchdog (re)configuration was unified to use softlockup_reconfigure_threads() for all configuration purposes. But that includes scenarios which solely configure the nmi watchdog. Rename softlockup_reconfigure_threads() and softlockup_init_threads() so the function names match the functionality. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Don Zickus --- kernel/watchdog.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5c6fb7cd9ae8..d241bd99cee1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -551,7 +551,7 @@ static void softlockup_unpark_threads(void) softlockup_update_smpboot_threads(); } -static void softlockup_reconfigure_threads(void) +static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -570,13 +570,13 @@ static void softlockup_reconfigure_threads(void) } /* - * Create the watchdog thread infrastructure. + * Create the watchdog thread infrastructure and configure the detector(s). * * The threads are not unparked as watchdog_allowed_mask is empty. When * the threads are sucessfully initialized, take the proper locks and * unpark the threads in the watchdog_cpumask if the watchdog is enabled. 
*/ -static __init void softlockup_init_threads(void) +static __init void lockup_detector_setup(void) { int ret; @@ -599,7 +599,7 @@ static __init void softlockup_init_threads(void) mutex_lock(&watchdog_mutex); softlockup_threads_initialized = true; - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); mutex_unlock(&watchdog_mutex); } @@ -608,7 +608,7 @@ static inline int watchdog_park_threads(void) { return 0; } static inline void watchdog_unpark_threads(void) { } static inline int watchdog_enable_all_cpus(void) { return 0; } static inline void watchdog_disable_all_cpus(void) { } -static void softlockup_reconfigure_threads(void) +static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -616,9 +616,9 @@ static void softlockup_reconfigure_threads(void) watchdog_nmi_start(); cpus_read_unlock(); } -static inline void softlockup_init_threads(void) +static inline void lockup_detector_setup(void) { - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -658,7 +658,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - softlockup_reconfigure_threads(); + lockup_detector_reconfigure(); } /* @@ -785,5 +785,5 @@ void __init lockup_detector_init(void) if (!watchdog_nmi_probe()) nmi_watchdog_available = true; - softlockup_init_threads(); + lockup_detector_setup(); } -- cgit v1.2.3 From 0b62bf862dc93a05fea97b6ca6ffca072e2f30c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Oct 2017 20:59:09 +0200 Subject: watchdog/core: Put softlockup_threads_initialized under ifdef guard The variable is unused when the softlockup detector is disabled in Kconfig. Signed-off-by: Thomas Gleixner --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d241bd99cee1..6bcb854909c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,7 +47,6 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly nmi_watchdog_available; struct cpumask watchdog_allowed_mask __read_mostly; -static bool softlockup_threads_initialized __read_mostly; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -168,6 +167,7 @@ static void lockup_detector_update_enable(void) unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -- cgit v1.2.3 From d8c4deee6dc6876ae3b7d09a5b58138a57ee45f6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 8 Sep 2017 23:55:17 -0700 Subject: tracing: Remove obsolete sched_switch tracer selftest Since commit 87d80de2800d087ea833cb79bc13f85ff34ed49f ("tracing: Remove obsolete sched_switch tracer"), the sched_switch tracer selftest is no longer used. This patch removes the same. 
Link: http://lkml.kernel.org/r/20170909065517.22262-1-joelaf@google.com Cc: Ingo Molnar Cc: kernel-team@android.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 -- kernel/trace/trace_selftest.c | 32 -------------------------------- 2 files changed, 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 652c682707cd..3c078e2ad777 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -738,8 +738,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sched_switch(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); /* diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b17ec642793b..364f78abdf47 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1150,38 +1150,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_SCHED_TRACER */ -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -int -trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(&tr->trace_buffer, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) -- cgit v1.2.3 From 6e7a2398114a2597a0995f96f44d908741ae5035 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 23 Aug 2017 12:23:09 +0100 Subject: tracing: Remove redundant unread variable ret Integer ret is being assigned but never used and hence it is redundant. 
Remove it, fixes clang warning: trace_events_hist.c:1077:3: warning: Value stored to 'ret' is never read Link: http://lkml.kernel.org/r/20170823112309.19383-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1c21d0e2a145..f123b5d0c226 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1062,7 +1062,7 @@ static void hist_trigger_show(struct seq_file *m, struct event_trigger_data *data, int n) { struct hist_trigger_data *hist_data; - int n_entries, ret = 0; + int n_entries; if (n > 0) seq_puts(m, "\n\n"); @@ -1073,10 +1073,8 @@ static void hist_trigger_show(struct seq_file *m, hist_data = data->private_data; n_entries = print_entries(m, hist_data); - if (n_entries < 0) { - ret = n_entries; + if (n_entries < 0) n_entries = 0; - } seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), -- cgit v1.2.3 From 12ecef0cb12102d8c034770173d2d1363cb97d52 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Sep 2017 16:22:49 -0400 Subject: tracing: Reverse the order of trace_types_lock and event_mutex In order to make future changes where we need to call tracing_set_clock() from within an event command, the order of trace_types_lock and event_mutex must be reversed, as the event command will hold event_mutex and the trace_types_lock is taken from within tracing_set_clock(). Link: http://lkml.kernel.org/r/20170921162249.0dde3dca@gandalf.local.home Requested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 +++++ kernel/trace/trace_events.c | 31 +++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 752e5daf0896..5f1ac7d3402c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *name) struct trace_array *tr; int ret; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = -EEXIST; @@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *name) list_add(&tr->list, &ftrace_trace_arrays); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return 0; @@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *name) out_unlock: mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; @@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *name) int ret; int i; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = -ENODEV; @@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *name) out_unlock: mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 87468398b9ed..ec0f9aa4e151 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp) return -ENODEV; /* Make sure the system still exists */ - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { list_for_each_entry(dir, &tr->systems, list) { if (dir == inode->i_private) { @@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file 
*filp) } } exit_loop: - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); if (!system) return -ENODEV; @@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struct trace_event_call *call); int trace_add_event_call(struct trace_event_call *call) { int ret; - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); ret = __register_event(call, NULL); if (ret >= 0) __add_event_to_tracers(call); - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; } @@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace_event_call *call) { int ret; - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; } @@ -2424,8 +2424,8 @@ static int trace_module_notify(struct notifier_block *self, { struct module *mod = data; - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); switch (val) { case MODULE_STATE_COMING: trace_module_add_events(mod); @@ -2434,8 +2434,8 @@ static int trace_module_notify(struct notifier_block *self, trace_module_remove_events(mod); break; } - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return 0; } @@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) * creates the event hierachry in the @parent/events directory. * * Returns 0 on success. + * + * Must be called with event_mutex held. */ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) - goto out_unlock; + goto out; down_write(&trace_event_sem); __trace_add_event_dirs(tr); up_write(&trace_event_sem); - out_unlock: - mutex_unlock(&event_mutex); - + out: return ret; } @@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) return ret; } +/* Must be called with event_mutex held */ int event_trace_del_tracer(struct trace_array *tr) { - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); @@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_array *tr) tr->event_dir = NULL; - mutex_unlock(&event_mutex); - return 0; } -- cgit v1.2.3 From 1a149d7d3f45d311da1f63473736c05f30ae8a75 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 16:59:02 -0400 Subject: ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler The current method to prevent the ring buffer from entering into a recursize loop is to use a bitmask and set the bit that maps to the current context (normal, softirq, irq or NMI), and if that bit was already set, it is considered a recursive loop. New code is being added that may require the ring buffer to be entered a second time in the current context. The recursive locking prevents that from happening. Instead of mapping a bitmask to the current context, just allow 4 levels of nesting in the ring buffer. This matches the 4 context levels that it can already nest. It is highly unlikely to have more than two levels, thus it should be fine when we add the second entry into the ring buffer. 
If that proves to be a problem, we can always up the number to 8. An added benefit is that reading preempt_count() to get the current level adds a very slight but noticeable overhead. This removes that need. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 64 ++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 81279c6602ff..f6ee9b1ef62a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2538,61 +2538,29 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. To pass this - * information from the lock to the unlock without having to - * access the 'in_interrupt()' functions again (which do show - * a bit of overhead in something as critical as function tracing, - * we use a bitmask trick. + * be modified more than once via an interrupt. There are four + * different contexts that we need to consider. * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. + * Normal context. + * SoftIRQ context + * IRQ context + * NMI context * - * This works because this is the order of contexts that can - * preempt other contexts. A SoftIRQ never preempts an IRQ - * context. - * - * When the context is determined, the corresponding bit is - * checked and set (if it was set, then a recursion of that context - * happened). - * - * On unlock, we need to clear this bit. To do so, just subtract - * 1 from the current_context and AND it to itself. - * - * (binary) - * 101 - 1 = 100 - * 101 & 100 = 100 (clearing bit zero) - * - * 1010 - 1 = 1001 - * 1010 & 1001 = 1000 (clearing bit 1) - * - * The least significant bit can be cleared this way, and it - * just so happens that it is the same bit corresponding to - * the current context. + * If for some reason the ring buffer starts to recurse, we + * only allow that to happen at most 4 times (one for each + * context). If it happens 5 times, then we consider this a + * recusive loop and do not let it go further. 
*/ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned int val = cpu_buffer->current_context; - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = RB_CTX_NMI; - else if (in_irq()) - bit = RB_CTX_IRQ; - else - bit = RB_CTX_SOFTIRQ; - } else - bit = RB_CTX_NORMAL; - - if (unlikely(val & (1 << bit))) + if (cpu_buffer->current_context >= 4) return 1; - val |= (1 << bit); - cpu_buffer->current_context = val; + cpu_buffer->current_context++; + /* Interrupts must see this update */ + barrier(); return 0; } @@ -2600,7 +2568,9 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->current_context &= cpu_buffer->current_context - 1; + /* Don't let the dec leak out */ + barrier(); + cpu_buffer->current_context--; } /** -- cgit v1.2.3 From a15f7fc20389a8827d5859907568b201234d4b79 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:17 -0500 Subject: tracing: Exclude 'generic fields' from histograms There are a small number of 'generic fields' (comm/COMM/cpu/CPU) that are found by trace_find_event_field() but are only meant for filtering. Specifically, they unlike normal fields, they have a size of 0 and thus wreak havoc when used as a histogram key. Exclude these (return -EINVAL) when used as histogram keys. Link: http://lkml.kernel.org/r/956154cbc3e8a4f0633d619b886c97f0f0edf7b4.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f123b5d0c226..121d56850f24 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -450,7 +450,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, } field = trace_find_event_field(file->event_call, field_name); - if (!field) { + if (!field || !field->size) { ret = -EINVAL; goto out; } @@ -548,7 +548,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } field = trace_find_event_field(file->event_call, field_name); - if (!field) { + if (!field || !field->size) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From 83c07ecc4203728e85fc4a2ce6fdf25d16ea118e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:18 -0500 Subject: tracing: Remove lookups from tracing_map hitcount Lookups inflate the hitcount, making it essentially useless. Only inserts and updates should really affect the hitcount anyway, so explicitly filter lookups out. 
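As a toy illustration of the accounting change (standalone C, not the tracing_map code): the hit counter is bumped only on the insert/update path, so pure lookups no longer inflate the statistics, matching the hunk below.

#include <stdio.h>

struct toy_map {
	long hits;
	long val;
};

static long *toy_map_insert(struct toy_map *map, int lookup_only)
{
	if (!lookup_only)
		map->hits++;	/* inserts and updates count as hits */
	return &map->val;	/* pure lookups leave the statistics alone */
}

int main(void)
{
	struct toy_map map = { 0, 0 };

	*toy_map_insert(&map, 0) = 42;		/* insert: counted     */
	(void)*toy_map_insert(&map, 1);		/* lookup: not counted */
	printf("hits = %ld\n", map.hits);	/* prints 1 */
	return 0;
}
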
Link: http://lkml.kernel.org/r/c8d9dc39d269a8abf88bf4102d0dfc65deb0fc7f.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 305039b122fa..07e75344725b 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) if (test_key && test_key == key_hash && entry->val && keys_match(key, entry->val->key, map->key_size)) { - atomic64_inc(&map->hits); + if (!lookup_only) + atomic64_inc(&map->hits); return entry->val; } -- cgit v1.2.3 From 4f36c2d85cedea60ad424d44534121ab0458069e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:19 -0500 Subject: tracing: Increase tracing map KEYS_MAX size The current default for the number of subkeys in a compound key is 2, which is too restrictive. Increase it to a more realistic value of 3. Link: http://lkml.kernel.org/r/b6952cca06d1f912eba33804a6fd6069b3847d44.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 618838f5f30a..f0975110b967 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -5,7 +5,7 @@ #define TRACING_MAP_BITS_MAX 17 #define TRACING_MAP_BITS_MIN 7 -#define TRACING_MAP_KEYS_MAX 2 +#define TRACING_MAP_KEYS_MAX 3 #define TRACING_MAP_VALS_MAX 3 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ TRACING_MAP_VALS_MAX) -- cgit v1.2.3 From 7e465baa80293ed5f87fdf6405391d6f02110d4e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:20 -0500 Subject: tracing: Make traceprobe parsing code reusable traceprobe_probes_write() and traceprobe_command() actually contain nothing that ties them to kprobes - the code is generically useful for similar types of parsing elsewhere, so separate it out and move it to trace.c/trace.h. Other than moving it, the only change is in naming: traceprobe_probes_write() becomes trace_parse_run_command() and traceprobe_command() becomes trace_run_command(). 
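The moved helpers follow a simple parse-and-dispatch shape. Here is a userspace sketch of that shape (strdup()/strtok() standing in for argv_split(), and the callback name is made up): split one command line into argc/argv and hand it to a creator callback, the way trace_run_command() hands argv_split() output to createfn().

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int create_probe(int argc, char **argv)
{
	printf("create: %d args, first = %s\n", argc, argv[0]);
	return 0;
}

static int run_command(const char *buf, int (*createfn)(int, char **))
{
	char *copy = strdup(buf);
	char *argv[16];
	int argc = 0, ret = 0;

	if (!copy)
		return -1;
	for (char *tok = strtok(copy, " \t"); tok && argc < 16;
	     tok = strtok(NULL, " \t"))
		argv[argc++] = tok;
	if (argc)
		ret = createfn(argc, argv);
	free(copy);
	return ret;
}

int main(void)
{
	return run_command("p:testprobe some_function $stack", create_probe);
}

The kernel side additionally splits the written buffer on newlines, strips '#' comments and enforces a maximum line length before handing each line to trace_run_command(), as the code moved into trace.c below shows.
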
Link: http://lkml.kernel.org/r/ae5c26ea40c196a8986854d921eb6e713ede7e3f.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 86 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 7 ++++ kernel/trace/trace_kprobe.c | 18 +++++----- kernel/trace/trace_probe.c | 86 --------------------------------------------- kernel/trace/trace_probe.h | 7 ---- kernel/trace/trace_uprobe.c | 2 +- 6 files changed, 103 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f1ac7d3402c..73e67b68c53b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8281,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) } EXPORT_SYMBOL_GPL(ftrace_dump); +int trace_run_command(const char *buf, int (*createfn)(int, char **)) +{ + char **argv; + int argc, ret; + + argc = 0; + ret = 0; + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = createfn(argc, argv); + + argv_free(argv); + + return ret; +} + +#define WRITE_BUFSIZE 4096 + +ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos, + int (*createfn)(int, char **)) +{ + char *kbuf, *buf, *tmp; + int ret = 0; + size_t done = 0; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + while (done < count) { + size = count - done; + + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE - 2); + ret = -EINVAL; + goto out; + } + } + done += size; + + /* Remove comments */ + tmp = strchr(buf, '#'); + + if (tmp) + *tmp = '\0'; + + ret = trace_run_command(buf, createfn); + if (ret) + goto out; + buf += size; + + } while (done < count); + } + ret = done; + +out: + kfree(kbuf); + + return ret; +} + __init static int tracer_alloc_buffers(void) { int ring_buf_size; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3c078e2ad777..f8343eb3c1f9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1752,6 +1752,13 @@ void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +#define MAX_EVENT_NAME_LEN 64 + +extern int trace_run_command(const char *buf, int (*createfn)(int, char**)); +extern ssize_t trace_parse_run_command(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos, + int (*createfn)(int, char**)); + /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8a907e12b6b9..af6134f2e597 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file) static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - return traceprobe_probes_write(file, buffer, count, ppos, - create_trace_kprobe); + 
return trace_parse_run_command(file, buffer, count, ppos, + create_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -1433,9 +1433,9 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); - ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " - "$stack $stack0 +0($stack)", - create_trace_kprobe); + ret = trace_run_command("p:testprobe kprobe_trace_selftest_target " + "$stack $stack0 +0($stack)", + create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; @@ -1455,8 +1455,8 @@ static __init int kprobe_trace_self_tests_init(void) } } - ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " - "$retval", create_trace_kprobe); + ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target " + "$retval", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; @@ -1526,13 +1526,13 @@ static __init int kprobe_trace_self_tests_init(void) disable_trace_kprobe(tk, file); } - ret = traceprobe_command("-:testprobe", create_trace_kprobe); + ret = trace_run_command("-:testprobe", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = traceprobe_command("-:testprobe2", create_trace_kprobe); + ret = trace_run_command("-:testprobe2", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 52478f033f88..d59357308677 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) kfree(arg->comm); } -int traceprobe_command(const char *buf, int (*createfn)(int, char **)) -{ - char **argv; - int argc, ret; - - argc = 0; - ret = 0; - argv = argv_split(GFP_KERNEL, buf, &argc); - if (!argv) - return -ENOMEM; - - if (argc) - ret = createfn(argc, argv); - - argv_free(argv); - - return ret; -} - -#define WRITE_BUFSIZE 4096 - -ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos, - int (*createfn)(int, char **)) -{ - char *kbuf, *buf, *tmp; - int ret = 0; - size_t done = 0; - size_t size; - - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - while (done < count) { - size = count - done; - - if (size >= WRITE_BUFSIZE) - size = WRITE_BUFSIZE - 1; - - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } - kbuf[size] = '\0'; - buf = kbuf; - do { - tmp = strchr(buf, '\n'); - if (tmp) { - *tmp = '\0'; - size = tmp - buf + 1; - } else { - size = strlen(buf); - if (done + size < count) { - if (buf != kbuf) - break; - /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ - pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE - 2); - ret = -EINVAL; - goto out; - } - } - done += size; - - /* Remove comments */ - tmp = strchr(buf, '#'); - - if (tmp) - *tmp = '\0'; - - ret = traceprobe_command(buf, createfn); - if (ret) - goto out; - buf += size; - - } while (done < count); - } - ret = done; - -out: - kfree(kbuf); - - return ret; -} - static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, bool is_return) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 903273c93e61..fb66e3eaa192 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -42,7 +42,6 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 
-#define MAX_EVENT_NAME_LEN 64 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ @@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); -extern ssize_t traceprobe_probes_write(struct file *file, - const char __user *buffer, size_t count, loff_t *ppos, - int (*createfn)(int, char**)); - -extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); - /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..b34965e62fb9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file) static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); + return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe); } static const struct file_operations uprobe_events_ops = { -- cgit v1.2.3 From 0d7a8325bf3326c92da2d21b4496a9ddde896d4f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:21 -0500 Subject: tracing: Clean up hist_field_flags enum As we add more flags, specifying explicit integers for the flag values becomes more unwieldy and error-prone - switch them over to left-shift values. Link: http://lkml.kernel.org/r/e644e4fb7665aec015f4a2d84a2f990d3dd5b8a1.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 121d56850f24..0c7ec3048624 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -110,16 +110,16 @@ DEFINE_HIST_FIELD_FN(u8); #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) enum hist_field_flags { - HIST_FIELD_FL_HITCOUNT = 1, - HIST_FIELD_FL_KEY = 2, - HIST_FIELD_FL_STRING = 4, - HIST_FIELD_FL_HEX = 8, - HIST_FIELD_FL_SYM = 16, - HIST_FIELD_FL_SYM_OFFSET = 32, - HIST_FIELD_FL_EXECNAME = 64, - HIST_FIELD_FL_SYSCALL = 128, - HIST_FIELD_FL_STACKTRACE = 256, - HIST_FIELD_FL_LOG2 = 512, + HIST_FIELD_FL_HITCOUNT = 1 << 0, + HIST_FIELD_FL_KEY = 1 << 1, + HIST_FIELD_FL_STRING = 1 << 2, + HIST_FIELD_FL_HEX = 1 << 3, + HIST_FIELD_FL_SYM = 1 << 4, + HIST_FIELD_FL_SYM_OFFSET = 1 << 5, + HIST_FIELD_FL_EXECNAME = 1 << 6, + HIST_FIELD_FL_SYSCALL = 1 << 7, + HIST_FIELD_FL_STACKTRACE = 1 << 8, + HIST_FIELD_FL_LOG2 = 1 << 9, }; struct hist_trigger_attrs { -- cgit v1.2.3 From 85013256cf01629f72a327674c5d007b4a4b40da Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:22 -0500 Subject: tracing: Add hist_field_name() accessor In preparation for hist_fields that won't be strictly based on trace_event_fields, add a new hist_field_name() accessor to allow that flexibility and update associated users. 
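The accessor idea can be pictured with the simplified user-space sketch below; field_name() and hist_field_sketch are illustrative names, not the kernel structures, and the operands[] fallback anticipates the log2 rework later in this series. The point is that callers stop dereferencing field->field->name directly, so a hist_field without a backing trace_event_field still yields a printable (possibly empty) name, and the level argument bounds recursion for chained fields.

    #include <stdio.h>

    struct event_field { const char *name; };

    struct hist_field_sketch {
        struct event_field *field;              /* may be NULL */
        struct hist_field_sketch *operands[2];  /* for chained fields */
    };

    static const char *field_name(struct hist_field_sketch *f,
                                  unsigned int level)
    {
        if (level > 1 || !f)
            return "";
        if (f->field && f->field->name)
            return f->field->name;
        if (f->operands[0])         /* e.g. a log2 field wraps an operand */
            return field_name(f->operands[0], level + 1);
        return "";
    }

    int main(void)
    {
        struct event_field ef = { .name = "lat" };
        struct hist_field_sketch base = { .field = &ef };
        struct hist_field_sketch log2f = { .operands = { &base } };

        printf("%s %s\n", field_name(&base, 0), field_name(&log2f, 0));
        return 0;
    }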
Link: http://lkml.kernel.org/r/5b5a2d36dde067cbbe2434b10f06daac27b7dbd5.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 67 +++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0c7ec3048624..34edf5fd85fd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -146,6 +146,23 @@ struct hist_trigger_data { struct tracing_map *map; }; +static const char *hist_field_name(struct hist_field *field, + unsigned int level) +{ + const char *field_name = ""; + + if (level > 1) + return field_name; + + if (field->field) + field_name = field->field->name; + + if (field_name == NULL) + field_name = ""; + + return field_name; +} + static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) { hist_field_fn_t fn = NULL; @@ -653,7 +670,6 @@ static int is_descending(const char *str) static int create_sort_keys(struct hist_trigger_data *hist_data) { char *fields_str = hist_data->attrs->sort_key_str; - struct ftrace_event_field *field = NULL; struct tracing_map_sort_key *sort_key; int descending, ret = 0; unsigned int i, j; @@ -670,7 +686,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) } for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { + struct hist_field *hist_field; char *field_str, *field_name; + const char *test_name; sort_key = &hist_data->sort_keys[i]; @@ -703,8 +721,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) } for (j = 1; j < hist_data->n_fields; j++) { - field = hist_data->fields[j]->field; - if (field && (strcmp(field_name, field->name) == 0)) { + hist_field = hist_data->fields[j]; + test_name = hist_field_name(hist_field, 0); + + if (strcmp(field_name, test_name) == 0) { sort_key->field_idx = j; descending = is_descending(field_str); if (descending < 0) { @@ -952,6 +972,7 @@ hist_trigger_entry_print(struct seq_file *m, struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; bool multiline = false; + const char *field_name; unsigned int i; u64 uval; @@ -963,26 +984,27 @@ hist_trigger_entry_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ", "); + field_name = hist_field_name(key_field, 0); + if (key_field->flags & HIST_FIELD_FL_HEX) { uval = *(u64 *)(key + key_field->offset); - seq_printf(m, "%s: %llx", - key_field->field->name, uval); + seq_printf(m, "%s: %llx", field_name, uval); } else if (key_field->flags & HIST_FIELD_FL_SYM) { uval = *(u64 *)(key + key_field->offset); sprint_symbol_no_offset(str, uval); - seq_printf(m, "%s: [%llx] %-45s", - key_field->field->name, uval, str); + seq_printf(m, "%s: [%llx] %-45s", field_name, + uval, str); } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { uval = *(u64 *)(key + key_field->offset); sprint_symbol(str, uval); - seq_printf(m, "%s: [%llx] %-55s", - key_field->field->name, uval, str); + seq_printf(m, "%s: [%llx] %-55s", field_name, + uval, str); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { char *comm = elt->private_data; uval = *(u64 *)(key + key_field->offset); - seq_printf(m, "%s: %-16s[%10llu]", - key_field->field->name, comm, uval); + seq_printf(m, "%s: %-16s[%10llu]", field_name, + comm, uval); } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { const char *syscall_name; @@ -991,8 +1013,8 @@ hist_trigger_entry_print(struct seq_file *m, if (!syscall_name) syscall_name = 
"unknown_syscall"; - seq_printf(m, "%s: %-30s[%3llu]", - key_field->field->name, syscall_name, uval); + seq_printf(m, "%s: %-30s[%3llu]", field_name, + syscall_name, uval); } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { seq_puts(m, "stacktrace:\n"); hist_trigger_stacktrace_print(m, @@ -1000,15 +1022,14 @@ hist_trigger_entry_print(struct seq_file *m, HIST_STACKTRACE_DEPTH); multiline = true; } else if (key_field->flags & HIST_FIELD_FL_LOG2) { - seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name, + seq_printf(m, "%s: ~ 2^%-2llu", field_name, *(u64 *)(key + key_field->offset)); } else if (key_field->flags & HIST_FIELD_FL_STRING) { - seq_printf(m, "%s: %-50s", key_field->field->name, + seq_printf(m, "%s: %-50s", field_name, (char *)(key + key_field->offset)); } else { uval = *(u64 *)(key + key_field->offset); - seq_printf(m, "%s: %10llu", key_field->field->name, - uval); + seq_printf(m, "%s: %10llu", field_name, uval); } } @@ -1021,13 +1042,13 @@ hist_trigger_entry_print(struct seq_file *m, tracing_map_read_sum(elt, HITCOUNT_IDX)); for (i = 1; i < hist_data->n_vals; i++) { + field_name = hist_field_name(hist_data->fields[i], 0); + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { - seq_printf(m, " %s: %10llx", - hist_data->fields[i]->field->name, + seq_printf(m, " %s: %10llx", field_name, tracing_map_read_sum(elt, i)); } else { - seq_printf(m, " %s: %10llu", - hist_data->fields[i]->field->name, + seq_printf(m, " %s: %10llu", field_name, tracing_map_read_sum(elt, i)); } } @@ -1140,7 +1161,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { - seq_printf(m, "%s", hist_field->field->name); + const char *field_name = hist_field_name(hist_field, 0); + + seq_printf(m, "%s", field_name); if (hist_field->flags) { const char *flags_str = get_hist_field_flags(hist_field); -- cgit v1.2.3 From 5819eaddf35b24d628ddfa4fbb5f8d4026e44b96 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:23 -0500 Subject: tracing: Reimplement log2 log2 as currently implemented applies only to u64 trace_event_field derived fields, and assumes that anything it's applied to is a u64 field. To prepare for synthetic fields like latencies, log2 should be applicable to those as well, so take the opportunity now to fix the current problems as well as expand to more general uses. log2 should be thought of as a chaining function rather than a field type. To enable this as well as possible future function implementations, add a hist_field operand array into the hist_field definition for this purpose, and make use of it to implement the log2 'function'. 
Link: http://lkml.kernel.org/r/b47f93fc0b87b36eccf716b0c018f3a71e1f1111.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 34edf5fd85fd..1e1558c99d56 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -28,12 +28,16 @@ struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); +#define HIST_FIELD_OPERANDS_MAX 2 + struct hist_field { struct ftrace_event_field *field; unsigned long flags; hist_field_fn_t fn; unsigned int size; unsigned int offset; + unsigned int is_signed; + struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; }; static u64 hist_field_none(struct hist_field *field, void *event) @@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event) static u64 hist_field_log2(struct hist_field *hist_field, void *event) { - u64 val = *(u64 *)(event + hist_field->field->offset); + struct hist_field *operand = hist_field->operands[0]; + + u64 val = operand->fn(operand, event); return (u64) ilog2(roundup_pow_of_two(val)); } @@ -156,6 +162,8 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; + else if (field->flags & HIST_FIELD_FL_LOG2) + field_name = hist_field_name(field->operands[0], ++level); if (field_name == NULL) field_name = ""; @@ -357,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = { .elt_init = hist_trigger_elt_comm_init, }; -static void destroy_hist_field(struct hist_field *hist_field) +static void destroy_hist_field(struct hist_field *hist_field, + unsigned int level) { + unsigned int i; + + if (level > 2) + return; + + if (!hist_field) + return; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) + destroy_hist_field(hist_field->operands[i], level + 1); + kfree(hist_field); } @@ -385,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, } if (flags & HIST_FIELD_FL_LOG2) { + unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; hist_field->fn = hist_field_log2; + hist_field->operands[0] = create_hist_field(field, fl); + hist_field->size = hist_field->operands[0]->size; goto out; } @@ -405,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, hist_field->fn = select_value_fn(field->size, field->is_signed); if (!hist_field->fn) { - destroy_hist_field(hist_field); + destroy_hist_field(hist_field, 0); return NULL; } } @@ -422,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { if (hist_data->fields[i]) { - destroy_hist_field(hist_data->fields[i]); + destroy_hist_field(hist_data->fields[i], 0); hist_data->fields[i] = NULL; } } -- cgit v1.2.3 From b35bd0d9f8a8ea17aae40893e18274d191a2d2c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 23:39:05 -0600 Subject: sysctl: remove /proc/sys/vm/nr_pdflush_threads This tunable has been obsolete since 2.6.32, and writes to the file have been failing and complaining in dmesg since then: nr_pdflush_threads exported in /proc is scheduled for removal That was 8 years ago. Remove the file ABI obsolete notice, and the sysfs file. 
Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- kernel/sysctl.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..a5dd8d82c253 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1344,11 +1344,6 @@ static struct ctl_table vm_table[] = { .proc_handler = dirtytime_interval_handler, .extra1 = &zero, }, - { - .procname = "nr_pdflush_threads", - .mode = 0444 /* read-only */, - .proc_handler = pdflush_proc_obsolete, - }, { .procname = "swappiness", .data = &vm_swappiness, -- cgit v1.2.3 From 6cafbe159416822f6d3dfd711bf4c39050c650ba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 20 Jun 2017 10:44:58 -0400 Subject: ftrace: Add a ftrace_free_mem() function for modules to use In order to be able to trace module init functions, the module code needs to tell ftrace what is being freed when the init sections are freed. Use the code that the main init calls to tell ftrace to free the main init sections. This requires passing in a start and end address to free. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..84cb5928665a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5868,10 +5868,10 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ -void __init ftrace_free_init_mem(void) +void ftrace_free_mem(void *start_ptr, void *end_ptr) { - unsigned long start = (unsigned long)(&__init_begin); - unsigned long end = (unsigned long)(&__init_end); + unsigned long start = (unsigned long)(start_ptr); + unsigned long end = (unsigned long)(end_ptr); struct ftrace_page **last_pg = &ftrace_pages_start; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -5913,6 +5913,14 @@ void __init ftrace_free_init_mem(void) mutex_unlock(&ftrace_lock); } +void __init ftrace_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + + ftrace_free_mem(start, end); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; -- cgit v1.2.3 From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. 
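As a concrete illustration of these attach semantics, the fragment below attaches a cgroup program with the new multi-prog flag through the raw bpf(2) syscall. It is a hedged sketch, not part of this patch: it assumes prog_fd already holds a loaded BPF_PROG_TYPE_CGROUP_SKB program, cgroup_fd an open cgroup directory, and uapi headers new enough to define BPF_F_ALLOW_MULTI; error handling is omitted.

    #include <unistd.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int attach_multi(int cgroup_fd, int prog_fd)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.target_fd = cgroup_fd;
        attr.attach_bpf_fd = prog_fd;
        attr.attach_type = BPF_CGROUP_INET_EGRESS;
        attr.attach_flags = BPF_F_ALLOW_MULTI;  /* run alongside other progs */

        return (int)syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
    }

Attaching a second program to the same cgroup with the same flag simply appends it to that cgroup's list; with NONE or BPF_F_ALLOW_OVERRIDE only a single program can be attached at a given cgroup, as described above.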
To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 467 +++++++++++++++++++++++++++++++++++-------------- kernel/bpf/core.c | 31 ++++ kernel/bpf/syscall.c | 37 ++-- kernel/cgroup/cgroup.c | 28 ++- 4 files changed, 414 insertions(+), 149 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..6b7500bbdb53 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; - - if (prog) { - bpf_prog_put(prog); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; + + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); + } +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; } + return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. 
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu **array) +{ + struct bpf_prog_array __rcu *progs; + struct bpf_prog_list *pl; + struct cgroup *p = cgrp; + int cnt = 0; + + /* count number of effective programs by walking parents */ + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[type]); + p = cgroup_parent(p); + } while (p); + + progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!progs) + return -ENOMEM; + + /* populate the array with effective progs */ + cnt = 0; + p = cgrp; + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + rcu_dereference_protected(progs, 1)-> + progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); + + *array = progs; + return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu *array) +{ + struct bpf_prog_array __rcu *old_array; + + old_array = xchg(&cgrp->bpf.effective[type], array); + /* free prog array after grace period, since __cgroup_bpf_run_*() + * might be still walking the array + */ + bpf_prog_array_free(old_array); } /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify - * @parent: the parent to inherit from */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp) { - unsigned int type; +/* has to use marco instead of const int, since compiler thinks + * that array below is variable length + */ +#define NR ARRAY_SIZE(cgrp->bpf.effective) + struct bpf_prog_array __rcu *arrays[NR] = {}; + int i; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { - struct bpf_prog *e; + for (i = 0; i < NR; i++) + INIT_LIST_HEAD(&cgrp->bpf.progs[i]); - e = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - rcu_assign_pointer(cgrp->bpf.effective[type], e); - cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; - } + for (i = 0; i < NR; i++) + if (compute_effective_progs(cgrp, i, &arrays[i])) + goto cleanup; + + for (i = 0; i < NR; i++) + activate_effective_progs(cgrp, i, arrays[i]); + + return 0; +cleanup: + for (i = 0; i < NR; i++) + bpf_prog_array_free(arrays[i]); + return -ENOMEM; } +#define BPF_CGROUP_MAX_PROGS 64 + /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. 
- * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; - - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } - - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup - */ + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + u32 old_flags; + int err; + + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; + + if (!hierarchy_allows_attach(cgrp, type, flags)) return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. 
+ * Disallow attaching multi-prog if overridable or none */ return -EPERM; - old_prog = cgrp->bpf.prog[type]; - - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; + + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); + } else { + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; + } + pl->prog = prog; } - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; + old_flags = cgrp->bpf.flags[type]; + cgrp->bpf.flags[type] = flags; - cgrp->bpf.prog[type] = prog; + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); - - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); - } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; - } + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. 
+ */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; } /** @@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + int ret; - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); + int ret; - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..6b49e1991ae7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +static struct { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +} empty_prog_array = { + .null_prog = NULL, +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +{ + if (prog_cnt) + return kzalloc(sizeof(struct bpf_prog_array) + + sizeof(struct bpf_prog *) * (prog_cnt + 1), + flags); + + return &empty_prog_array.hdr; +} + +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +{ + if (!progs || + progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + return; + kfree_rcu(progs, rcu); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b927da66f653..51bee695d32c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return 0; } +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; @@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; switch (attr->attach_type) { @@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, - attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); if (ret) bpf_prog_put(prog); cgroup_put(cgrp); @@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { + enum bpf_prog_type 
ptype; + struct bpf_prog *prog; struct cgroup *cgrp; int ret; @@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; case BPF_CGROUP_SOCK_OPS: - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ret = sockmap_get_from_fd(attr, false); - break; + return sockmap_get_from_fd(attr, false); default: return -EINVAL; } + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + cgroup_put(cgrp); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..57eb866ae78d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + trace_cgroup_setup_root(root); /* @@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_idr_free; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - if (parent) - cgroup_bpf_inherit(cgrp, parent); - cgroup_propagate_control(cgrp); return cgrp; +out_idr_free: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable) +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct cgroup *parent = cgroup_parent(cgrp); int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } -- cgit v1.2.3 From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: bpf: introduce BPF_PROG_QUERY command introduce BPF_PROG_QUERY command to retrieve a set of either attached programs to given cgroup or a set of effective programs that will execute for events within a cgroup Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: 
David S. Miller --- kernel/bpf/cgroup.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 10 ++++++++++ 4 files changed, 128 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6b7500bbdb53..e88abc0865d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -384,6 +384,52 @@ cleanup: return err; } +/* Must be called with cgroup_mutex held to avoid races. */ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6b49e1991ae7..eba966c09053 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 51bee695d32c..0048cb24ba7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if 
(attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1568,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57eb866ae78d..269512b94a94 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, mutex_unlock(&cgroup_mutex); return ret; } +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); + mutex_unlock(&cgroup_mutex); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ -- cgit v1.2.3 From 390ee7e29fc8e6e90d3065b00afb047c4ee552f9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:23 -0700 Subject: bpf: enforce return code for cgroup-bpf programs with addition of tnum logic the verifier got smart enough and we can enforce return codes at program load time. For now do so for cgroup-bpf program types. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cf9b72c59a0..52b022310f6a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3073,6 +3073,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int check_return_code(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *reg; + struct tnum range = tnum_range(0, 1); + + switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + break; + default: + return 0; + } + + reg = &env->cur_state.regs[BPF_REG_0]; + if (reg->type != SCALAR_VALUE) { + verbose("At program exit the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(range, reg->var_off)) { + verbose("At program exit the register R0 "); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("has value %s", tn_buf); + } else { + verbose("has unknown scalar value"); + } + verbose(" should have been 0 or 1\n"); + return -EINVAL; + } + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -3863,6 +3900,9 @@ static int do_check(struct bpf_verifier_env *env) return -EACCES; } + err = check_return_code(env); + if (err) + return err; process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { -- cgit v1.2.3 From 58e1177b4cd10b0d358faf7d7ebb3779f98bc3ea Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:26:55 -0700 Subject: timer: Convert schedule_timeout() to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new from_timer() helper and passing the timer pointer explicitly. Since this special timer is on the stack, it needs to have a wrapper structure to carry state once .data is eliminated. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: "Rafael J. Wysocki" Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. 
Petersen" Cc: Greg Kroah-Hartman Cc: Stephen Boyd Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-2-git-send-email-keescook@chromium.org --- kernel/time/timer.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f2674a056c26..38613ced2324 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1668,9 +1668,20 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -static void process_timeout(unsigned long __data) +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. + */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) { - wake_up_process((struct task_struct *)__data); + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); } /** @@ -1704,7 +1715,7 @@ static void process_timeout(unsigned long __data) */ signed long __sched schedule_timeout(signed long timeout) { - struct timer_list timer; + struct process_timer timer; unsigned long expire; switch (timeout) @@ -1738,13 +1749,14 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false); + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, false); schedule(); - del_singleshot_timer_sync(&timer); + del_singleshot_timer_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); + destroy_timer_on_stack(&timer.timer); timeout = expire - jiffies; -- cgit v1.2.3 From 5cd79d6abd2c142352dead0e3df04e86ee32f5d3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:00 -0700 Subject: timer: Remove users of TIMER_DEFERRED_INITIALIZER This removes uses of TIMER_DEFERRED_INITIALIZER and chooses a location to call timer_setup() from before add_timer() or mod_timer() is called. Adjusts callbacks to use from_timer() as needed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-7-git-send-email-keescook@chromium.org --- kernel/workqueue.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..a5361fc6215d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5390,11 +5390,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } */ #ifdef CONFIG_WQ_WATCHDOG -static void wq_watchdog_timer_fn(unsigned long data); - static unsigned long wq_watchdog_thresh = 30; -static struct timer_list wq_watchdog_timer = - TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); +static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; @@ -5408,7 +5405,7 @@ static void wq_watchdog_reset_touched(void) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } -static void wq_watchdog_timer_fn(unsigned long data) +static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; @@ -5510,6 +5507,7 @@ module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, static void wq_watchdog_init(void) { + timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } -- cgit v1.2.3 From 1d27e3e2252ba9d949ca82fbdb73cde102cb2067 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:04 -0700 Subject: timer: Remove expires and data arguments from DEFINE_TIMER Drop the arguments from the macro and adjust all callers with the following script: perl -pi -e 's/DEFINE_TIMER\((.*), 0, 0\);/DEFINE_TIMER($1);/g;' \ $(git grep DEFINE_TIMER | cut -d: -f1 | sort -u | grep -v timer.h) Signed-off-by: Kees Cook Acked-by: Geert Uytterhoeven # for m68k parts Acked-by: Guenter Roeck # for watchdog parts Acked-by: David S. Miller # for networking parts Acked-by: Greg Kroah-Hartman Acked-by: Kalle Valo # for wireless parts Acked-by: Arnd Bergmann Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Michael Reed Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Tejun Heo Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Guenter Roeck Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-11-git-send-email-keescook@chromium.org Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 061ba7eed4ed..c805e8691c22 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -20,7 +20,7 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); -static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); static int irq_poll_cpu; static atomic_t irq_poll_active; -- cgit v1.2.3 From fe5c3b69b540e3387223a696f327c1bb8880d1ac Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:06 -0700 Subject: kthread: Convert callback to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch kthread to use from_timer() and pass the timer pointer explicitly. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Petr Mladek Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-13-git-send-email-keescook@chromium.org --- kernel/kthread.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf82427..ba3992c8c375 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -798,15 +798,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. - * @__data: pointer to the data associated with the timer + * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. 
*/ -void kthread_delayed_work_timer_fn(unsigned long __data) +void kthread_delayed_work_timer_fn(struct timer_list *t) { - struct kthread_delayed_work *dwork = - (struct kthread_delayed_work *)__data; + struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; @@ -837,8 +836,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for -- cgit v1.2.3 From 8c20feb60604d91a29cd7fef8ac758bd92d9fd2c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 16:27:07 -0700 Subject: workqueue: Convert callback to use from_timer() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch workqueue to use from_timer() and pass the timer pointer explicitly. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Lai Jiangshan Cc: Sebastian Reichel Cc: Kalle Valo Cc: Paul Mackerras Cc: Pavel Machek Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" Cc: Wim Van Sebroeck Cc: Michael Ellerman Cc: Ursula Braun Cc: Geert Uytterhoeven Cc: Viresh Kumar Cc: Harish Patil Cc: Stephen Boyd Cc: Guenter Roeck Cc: Manish Chopra Cc: Len Brown Cc: Arnd Bergmann Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Julian Wiedmann Cc: John Stultz Cc: Mark Gross Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Ralf Baechle Cc: Stefan Richter Cc: Michael Reed Cc: netdev@vger.kernel.org Cc: Tejun Heo Cc: Andrew Morton Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee Link: https://lkml.kernel.org/r/1507159627-127660-14-git-send-email-keescook@chromium.org --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a5361fc6215d..c77fdf6bf24f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1492,9 +1492,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); -void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = (struct delayed_work *)__data; + struct delayed_work *dwork = from_timer(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -1508,8 +1508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); -- cgit v1.2.3 From 3e234289f86b12985ef8909cd34525fcb66c4efb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 18:00:22 -0500 Subject: ftrace: Allow module init functions to be traced Allow for module init sections to be traced as well as core kernel init sections. Now that filtering modules functions can be stored, for when they are loaded, it makes sense to be able to trace them. Cc: Jessica Yu Cc: Rusty Russell Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 2 ++ kernel/trace/ftrace.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..58bca427ac3f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3473,6 +3473,8 @@ static noinline int do_init_module(struct module *mod) if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); + ftrace_free_mem(mod->init_layout.base, mod->init_layout.base + + mod->init_layout.size); mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84cb5928665a..d7297e866e4a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5752,7 +5752,8 @@ void ftrace_release_mod(struct module *mod) last_pg = &ftrace_pages_start; for (pg = ftrace_pages_start; pg; pg = *last_pg) { rec = &pg->records[0]; - if (within_module_core(rec->ip, mod)) { + if (within_module_core(rec->ip, mod) || + within_module_init(rec->ip, mod)) { /* * As core pages are first, the first * page should never be a module page. @@ -5821,7 +5822,8 @@ void ftrace_module_enable(struct module *mod) * not part of this module, then skip this pg, * which the "break" will do. 
*/ - if (!within_module_core(rec->ip, mod)) + if (!within_module_core(rec->ip, mod) && + !within_module_init(rec->ip, mod)) break; cnt = 0; -- cgit v1.2.3 From aba4b5c22cbac296f4081a0476d0c55828f135b4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 08:35:38 -0400 Subject: ftrace: Save module init functions kallsyms symbols for tracing If function tracing is active when the module init functions are freed, then store them to be referenced by kallsyms. As module init functions can now be traced on module load, they were useless: ># echo ':mod:snd_seq' > set_ftrace_filter ># echo function > current_tracer ># modprobe snd_seq ># cat trace # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | modprobe-2786 [000] .... 3189.037874: 0xffffffffa0860000 <-do_one_initcall modprobe-2786 [000] .... 3189.037876: 0xffffffffa086004d <-0xffffffffa086000f modprobe-2786 [000] .... 3189.037876: 0xffffffffa086010d <-0xffffffffa0860018 modprobe-2786 [000] .... 3189.037877: 0xffffffffa086011a <-0xffffffffa0860021 modprobe-2786 [000] .... 3189.037877: 0xffffffffa0860080 <-0xffffffffa086002a modprobe-2786 [000] .... 3189.039523: 0xffffffffa0860400 <-0xffffffffa0860033 modprobe-2786 [000] .... 3189.039523: 0xffffffffa086038a <-0xffffffffa086041c modprobe-2786 [000] .... 3189.039591: 0xffffffffa086038a <-0xffffffffa0860436 modprobe-2786 [000] .... 3189.039657: 0xffffffffa086038a <-0xffffffffa0860450 modprobe-2786 [000] .... 3189.039719: 0xffffffffa0860127 <-0xffffffffa086003c modprobe-2786 [000] .... 3189.039742: snd_seq_create_kernel_client <-0xffffffffa08601f6 When the output is shown, the kallsyms for the module init functions have already been freed, and the output of the trace can not convert them to their function names. Now this looks like this: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | modprobe-2463 [002] .... 174.243237: alsa_seq_init <-do_one_initcall modprobe-2463 [002] .... 174.243239: client_init_data <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_sequencer_memory_init <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_seq_queues_init <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_sequencer_device_init <-alsa_seq_init modprobe-2463 [002] .... 174.244860: snd_seq_info_init <-alsa_seq_init modprobe-2463 [002] .... 174.244861: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.244936: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.245003: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.245072: snd_seq_system_client_init <-alsa_seq_init modprobe-2463 [002] .... 
174.245094: snd_seq_create_kernel_client <-snd_seq_system_client_init Signed-off-by: Steven Rostedt (VMware) --- kernel/kallsyms.c | 5 ++ kernel/module.c | 2 +- kernel/trace/ftrace.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 150 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 127e7cfafa55..976ecb9275d9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr, if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); + + if (!ret) + ret = ftrace_mod_address_lookup(addr, symbolsize, + offset, modname, namebuf); return ret; } diff --git a/kernel/module.c b/kernel/module.c index 58bca427ac3f..279a469dc375 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3473,7 +3473,7 @@ static noinline int do_init_module(struct module *mod) if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); - ftrace_free_mem(mod->init_layout.base, mod->init_layout.base + + ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + mod->init_layout.size); mutex_lock(&module_mutex); /* Drop initial reference. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d7297e866e4a..86dbbfb353db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5675,6 +5675,21 @@ static int ftrace_process_locs(struct module *mod, return ret; } +struct ftrace_mod_func { + struct list_head list; + char *name; + unsigned long ip; + unsigned int size; +}; + +struct ftrace_mod_map { + struct list_head list; + struct module *mod; + unsigned long start_addr; + unsigned long end_addr; + struct list_head funcs; +}; + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -5868,9 +5883,123 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } + +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) +{ + struct ftrace_mod_func *mod_func; + unsigned long symsize; + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str); + if (!ret) + return; + + mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL); + if (!mod_func) + return; + + mod_func->name = kstrdup(str, GFP_KERNEL); + if (!mod_func->name) { + kfree(mod_func); + return; + } + + mod_func->ip = rec->ip - offset; + mod_func->size = symsize; + + list_add_rcu(&mod_func->list, &mod_map->funcs); +} + +static LIST_HEAD(ftrace_mod_maps); + +static struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + struct ftrace_mod_map *mod_map; + + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); + if (!mod_map) + return NULL; + + mod_map->mod = mod; + mod_map->start_addr = start; + mod_map->end_addr = end; + + INIT_LIST_HEAD_RCU(&mod_map->funcs); + + list_add_rcu(&mod_map->list, &ftrace_mod_maps); + + return mod_map; +} + +static const char * +ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, + unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + struct ftrace_mod_func *found_func = NULL; + struct ftrace_mod_func *mod_func; + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if (addr >= mod_func->ip && + addr < 
mod_func->ip + mod_func->size) { + found_func = mod_func; + break; + } + } + + if (found_func) { + if (size) + *size = found_func->size; + if (off) + *off = addr - found_func->ip; + if (sym) + strlcpy(sym, found_func->name, KSYM_NAME_LEN); + + return found_func->name; + } + + return NULL; +} + +const char * +ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + struct ftrace_mod_map *mod_map; + const char *ret = NULL; + + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); + if (ret) { + if (modname) + *modname = mod_map->mod->name; + break; + } + } + preempt_enable(); + + return ret; +} + +#else +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) { } +static inline struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + return NULL; +} #endif /* CONFIG_MODULES */ -void ftrace_free_mem(void *start_ptr, void *end_ptr) +void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) { unsigned long start = (unsigned long)(start_ptr); unsigned long end = (unsigned long)(end_ptr); @@ -5878,6 +6007,7 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; + struct ftrace_mod_map *mod_map = NULL; int order; key.ip = start; @@ -5885,6 +6015,14 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) mutex_lock(&ftrace_lock); + /* + * If we are freeing module init memory, then check if + * any tracer is active. If so, we need to save a mapping of + * the module functions being freed with the address. + */ + if (mod && ftrace_ops_list != &ftrace_list_end) + mod_map = allocate_ftrace_mod_map(mod, start, end); + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { if (end < pg->records[0].ip || start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) @@ -5895,6 +6033,10 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) ftrace_cmp_recs); if (!rec) continue; + + if (mod_map) + save_ftrace_mod_rec(mod_map, rec); + pg->index--; ftrace_update_tot_cnt--; if (!pg->index) { @@ -5920,7 +6062,7 @@ void __init ftrace_free_init_mem(void) void *start = (void *)(&__init_begin); void *end = (void *)(&__init_end); - ftrace_free_mem(start, end); + ftrace_free_mem(NULL, start, end); } void __init ftrace_init(void) -- cgit v1.2.3 From 6aa69784b43eb5f69120339938c50a97a433049f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 5 Sep 2017 19:20:16 -0400 Subject: ftrace: Add freeing algorithm to free ftrace_mod_maps The ftrace_mod_map is a descriptor to save module init function names in case they were traced, and the trace output needs to reference the function name from the function address. But after the function is unloaded, it the maps should be freed, as the rest of the function names are as well. 
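The freeing follows the usual RCU-deferred teardown pattern: unlink the map from the global list with list_del_rcu(), then let call_rcu_sched() run the actual kfree() after a grace period, so lookups still walking the list under preempt_disable() stay safe. A minimal sketch of that pattern, using invented structure and function names rather than the actual ftrace ones:

  #include <linux/list.h>
  #include <linux/rcupdate.h>
  #include <linux/slab.h>

  struct mod_name_entry {
      struct list_head list;
      char *name;
  };

  struct mod_name_map {
      struct rcu_head rcu;
      struct list_head list;     /* on a global, RCU-protected list */
      struct list_head funcs;    /* list of struct mod_name_entry */
  };

  /* Runs after a grace period: no reader can still walk the lists. */
  static void mod_name_map_free(struct rcu_head *rcu)
  {
      struct mod_name_map *map = container_of(rcu, struct mod_name_map, rcu);
      struct mod_name_entry *e, *n;

      list_for_each_entry_safe(e, n, &map->funcs, list) {
          list_del(&e->list);
          kfree(e->name);
          kfree(e);
      }
      kfree(map);
  }

  /* Called under the writer-side lock when the module goes away. */
  static void mod_name_map_release(struct mod_name_map *map)
  {
      list_del_rcu(&map->list);
      call_rcu_sched(&map->rcu, mod_name_map_free);
  }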
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 86dbbfb353db..a5824408bed9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5683,6 +5683,7 @@ struct ftrace_mod_func { }; struct ftrace_mod_map { + struct rcu_head rcu; struct list_head list; struct module *mod; unsigned long start_addr; @@ -5694,6 +5695,8 @@ struct ftrace_mod_map { #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +static LIST_HEAD(ftrace_mod_maps); + static int referenced_filters(struct dyn_ftrace *rec) { struct ftrace_ops *ops; @@ -5747,8 +5750,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg) mutex_unlock(&trace_types_lock); } +static void ftrace_free_mod_map(struct rcu_head *rcu) +{ + struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu); + struct ftrace_mod_func *mod_func; + struct ftrace_mod_func *n; + + /* All the contents of mod_map are now not visible to readers */ + list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) { + kfree(mod_func->name); + list_del(&mod_func->list); + kfree(mod_func); + } + + kfree(mod_map); +} + void ftrace_release_mod(struct module *mod) { + struct ftrace_mod_map *mod_map; + struct ftrace_mod_map *n; struct dyn_ftrace *rec; struct ftrace_page **last_pg; struct ftrace_page *tmp_page = NULL; @@ -5760,6 +5781,14 @@ void ftrace_release_mod(struct module *mod) if (ftrace_disabled) goto out_unlock; + list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { + if (mod_map->mod == mod) { + list_del_rcu(&mod_map->list); + call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map); + break; + } + } + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -5914,8 +5943,6 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, list_add_rcu(&mod_func->list, &mod_map->funcs); } -static LIST_HEAD(ftrace_mod_maps); - static struct ftrace_mod_map * allocate_ftrace_mod_map(struct module *mod, unsigned long start, unsigned long end) @@ -5974,6 +6001,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, struct ftrace_mod_map *mod_map; const char *ret = NULL; + /* mod_map is freed via call_rcu_sched() */ preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); -- cgit v1.2.3 From 6171a0310a06a7a0cb83713fa7068bdd4192de19 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 6 Sep 2017 08:40:41 -0400 Subject: ftrace/kallsyms: Have /proc/kallsyms show saved mod init functions If a module is loaded while tracing is enabled, then there's a possibility that the module init functions were traced. These functions have their name and address stored by ftrace such that it can translate the function address that is written into the buffer into a human readable function name. As userspace tools may be doing the same, they need a way to map function names to their address as well. This is done through reading /proc/kallsyms. 
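For illustration only (not part of the patch), this is roughly what a userspace consumer of /proc/kallsyms does to turn a raw address back into a name; with this change, saved module init symbols simply appear as additional lines in that file:

  #include <inttypes.h>
  #include <stdio.h>

  /* Best-effort: resolve addr to the nearest symbol at or below it.
   * Non-root readers may see zeroed addresses depending on kptr_restrict.
   */
  static int kallsyms_lookup_addr(uint64_t addr, char *name, size_t len)
  {
      char line[512], sym[256], type;
      uint64_t val, best = 0;
      int found = 0;
      FILE *f = fopen("/proc/kallsyms", "r");

      if (!f)
          return -1;
      while (fgets(line, sizeof(line), f)) {
          if (sscanf(line, "%" SCNx64 " %c %255s", &val, &type, sym) != 3)
              continue;
          if (val <= addr && val >= best) {
              best = val;
              snprintf(name, len, "%s", sym);
              found = 1;
          }
      }
      fclose(f);
      return found ? 0 : -1;
  }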
Signed-off-by: Steven Rostedt (VMware) --- kernel/kallsyms.c | 38 ++++++++++++++++++++++++++++++++------ kernel/trace/ftrace.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 976ecb9275d9..1966fe1c2b57 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -479,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); struct kallsym_iter { loff_t pos; loff_t pos_mod_end; + loff_t pos_ftrace_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -501,11 +502,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) +{ + int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_ftrace_mod_end = iter->pos; + return 0; + } + + return 1; +} + static int get_ksymbol_bpf(struct kallsym_iter *iter) { iter->module_name[0] = '\0'; iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, iter->name) < 0 ? 0 : 1; } @@ -530,20 +545,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; - if (new_pos == 0) + if (new_pos == 0) { iter->pos_mod_end = 0; + iter->pos_ftrace_mod_end = 0; + } } static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; - if (iter->pos_mod_end > 0 && - iter->pos_mod_end < iter->pos) + if (iter->pos_ftrace_mod_end > 0 && + iter->pos_ftrace_mod_end < iter->pos) return get_ksymbol_bpf(iter); - if (!get_ksymbol_mod(iter)) - return get_ksymbol_bpf(iter); + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) { + if (!get_ksymbol_ftrace_mod(iter)) + return get_ksymbol_bpf(iter); + return 1; + } + + if (!get_ksymbol_mod(iter)) { + if (!get_ksymbol_ftrace_mod(iter)) + return get_ksymbol_bpf(iter); + } return 1; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a5824408bed9..9e99bd55732e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5689,6 +5689,7 @@ struct ftrace_mod_map { unsigned long start_addr; unsigned long end_addr; struct list_head funcs; + unsigned int num_funcs; }; #ifdef CONFIG_MODULES @@ -5940,6 +5941,8 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, mod_func->ip = rec->ip - offset; mod_func->size = symsize; + mod_map->num_funcs++; + list_add_rcu(&mod_func->list, &mod_map->funcs); } @@ -5956,6 +5959,7 @@ allocate_ftrace_mod_map(struct module *mod, mod_map->mod = mod; mod_map->start_addr = start; mod_map->end_addr = end; + mod_map->num_funcs = 0; INIT_LIST_HEAD_RCU(&mod_map->funcs); @@ -6016,6 +6020,42 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, return ret; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + struct ftrace_mod_map *mod_map; + struct ftrace_mod_func *mod_func; + + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + + if (symnum >= mod_map->num_funcs) { + symnum -= mod_map->num_funcs; + continue; + } + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if (symnum > 1) { + symnum--; + continue; + } + + *value = mod_func->ip; + *type = 'T'; + strlcpy(name, 
mod_func->name, KSYM_NAME_LEN); + strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + *exported = 1; + preempt_enable(); + return 0; + } + WARN_ON(1); + break; + } + preempt_enable(); + return -ERANGE; +} + #else static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, struct dyn_ftrace *rec) { } -- cgit v1.2.3 From aaecaa0b5f31794f1711247da4b5883a6ff98163 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:31 -0700 Subject: tracing: Prepare to add preempt and irq trace events In preparation of adding irqsoff and preemptsoff enable and disable trace events, move required functions and code to make it easier to add these events in a later patch. This patch is just code movement and no functional change. Link: http://lkml.kernel.org/r/20171006005432.14244-2-joelaf@google.com Cc: Peter Zijlstra Cc: kernel-team@android.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7758bc0617cb..0e3033c00474 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,7 @@ #include "trace.h" +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -462,64 +463,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} - /* * We are only interested in hardirq on/off events: */ -void trace_hardirqs_on(void) +static inline void tracer_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +static inline void tracer_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_off); -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -void trace_preempt_on(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } -void trace_preempt_off(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); @@ -781,3 +762,70 @@ __init static int init_irqsoff_tracer(void) return 0; } core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ + +#ifndef CONFIG_IRQSOFF_TRACER +static inline void tracer_hardirqs_on(void) { } 
+static inline void tracer_hardirqs_off(void) { } +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#endif + +#ifndef CONFIG_PREEMPT_TRACER +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +void trace_hardirqs_on(void) +{ + tracer_hardirqs_on(); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + tracer_hardirqs_off(); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + tracer_hardirqs_on_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + tracer_hardirqs_off_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +/* + * Stubs: + */ + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} +#endif + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + tracer_preempt_on(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + tracer_preempt_off(a0, a1); +} +#endif -- cgit v1.2.3 From 1bd845bcb41d5b7f83745e0cb99273eb376f2ec5 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:09 +0200 Subject: padata: set cpu_index of unused CPUs to -1 The parallel queue per-cpu data structure gets initialized only for CPUs in the 'pcpu' CPU mask set. This is not sufficient as the reorder timer may run on a different CPU and might wrongly decide it's the target CPU for the next reorder item as per-cpu memory gets memset(0) and we might be waiting for the first CPU in cpumask.pcpu, i.e. cpu_index 0. Make the '__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index' compare in padata_get_next() fail in this case by initializing the cpu_index member of all per-cpu parallel queues. Use -1 for unused ones. Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- kernel/padata.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 868f947166d7..1b9b4bac4a9b 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -384,8 +384,14 @@ static void padata_init_pqueues(struct parallel_data *pd) struct padata_parallel_queue *pqueue; cpu_index = 0; - for_each_cpu(cpu, pd->cpumask.pcpu) { + for_each_possible_cpu(cpu) { pqueue = per_cpu_ptr(pd->pqueue, cpu); + + if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) { + pqueue->cpu_index = -1; + continue; + } + pqueue->pd = pd; pqueue->cpu_index = cpu_index; cpu_index++; -- cgit v1.2.3 From cf5868c8a22dc2854b96e9569064bb92365549ca Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:10 +0200 Subject: padata: ensure the reorder timer callback runs on the correct CPU The reorder timer function runs on the CPU where the timer interrupt was handled which is not necessarily one of the CPUs of the 'pcpu' CPU mask set. Ensure the padata_reorder() callback runs on the correct CPU, which is one in the 'pcpu' CPU mask set and, preferrably, the next expected one. Do so by comparing the current CPU with the expected target CPU. If they match, call padata_reorder() right away. 
If they differ, schedule a work item on the target CPU that does the padata_reorder() call for us. Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- kernel/padata.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 1b9b4bac4a9b..b4066147bce4 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -275,11 +275,51 @@ static void padata_reorder(struct parallel_data *pd) return; } +static void invoke_padata_reorder(struct work_struct *work) +{ + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + + local_bh_disable(); + pqueue = container_of(work, struct padata_parallel_queue, reorder_work); + pd = pqueue->pd; + padata_reorder(pd); + local_bh_enable(); +} + static void padata_reorder_timer(unsigned long arg) { struct parallel_data *pd = (struct parallel_data *)arg; + unsigned int weight; + int target_cpu, cpu; - padata_reorder(pd); + cpu = get_cpu(); + + /* We don't lock pd here to not interfere with parallel processing + * padata_reorder() calls on other CPUs. We just need any CPU out of + * the cpumask.pcpu set. It would be nice if it's the right one but + * it doesn't matter if we're off to the next one by using an outdated + * pd->processed value. + */ + weight = cpumask_weight(pd->cpumask.pcpu); + target_cpu = padata_index_to_cpu(pd, pd->processed % weight); + + /* ensure to call the reorder callback on the correct CPU */ + if (cpu != target_cpu) { + struct padata_parallel_queue *pqueue; + struct padata_instance *pinst; + + /* The timer function is serialized wrt itself -- no locking + * needed. + */ + pinst = pd->pinst; + pqueue = per_cpu_ptr(pd->pqueue, target_cpu); + queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); + } else { + padata_reorder(pd); + } + + put_cpu(); } static void padata_serial_worker(struct work_struct *serial_work) @@ -399,6 +439,7 @@ static void padata_init_pqueues(struct parallel_data *pd) __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); + INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder); atomic_set(&pqueue->num_obj, 0); } } -- cgit v1.2.3 From 350ef88e7e922354f82a931897ad4a4ce6c686ff Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:11 +0200 Subject: padata: ensure padata_do_serial() runs on the correct CPU If the algorithm we're parallelizing is asynchronous we might change CPUs between padata_do_parallel() and padata_do_serial(). However, we don't expect this to happen as we need to enqueue the padata object into the per-cpu reorder queue we took it from, i.e. the same-cpu's parallel queue. Ensure we're not switching CPUs for a given padata object by tracking the CPU within the padata object. If the serial callback gets called on the wrong CPU, defer invoking padata_reorder() via a kernel worker on the CPU we're expected to run on. 
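Both padata fixes lean on the same technique: record which CPU an object belongs to, and if the completion path finds itself on a different CPU, hand the follow-up call to a worker pinned to the right CPU via queue_work_on() instead of calling it directly. A minimal sketch of that pattern with made-up names (the real code reuses the per-cpu reorder_work added in the previous patch):

  #include <linux/kernel.h>
  #include <linux/smp.h>
  #include <linux/workqueue.h>

  struct pinned_item {
      int cpu;                   /* CPU this item must be finished on */
      struct work_struct work;   /* INIT_WORK()'d with pinned_item_work_fn */
  };

  static void pinned_item_process(struct pinned_item *item)
  {
      /* per-CPU post-processing that assumes it runs on item->cpu */
  }

  static void pinned_item_work_fn(struct work_struct *work)
  {
      pinned_item_process(container_of(work, struct pinned_item, work));
  }

  static void pinned_item_complete(struct pinned_item *item)
  {
      int cpu = get_cpu();

      if (cpu == item->cpu)
          pinned_item_process(item);                      /* right CPU */
      else
          queue_work_on(item->cpu, system_wq, &item->work); /* bounce over */

      put_cpu();
  }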
Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- kernel/padata.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b4066147bce4..f262c9a4e70a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->cb_cpu = cb_cpu; target_cpu = padata_cpu_hash(pd); + padata->cpu = target_cpu; queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -363,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata) int cpu; struct padata_parallel_queue *pqueue; struct parallel_data *pd; + int reorder_via_wq = 0; pd = padata->pd; cpu = get_cpu(); + + /* We need to run on the same CPU padata_do_parallel(.., padata, ..) + * was called on -- or, at least, enqueue the padata object into the + * correct per-cpu queue. + */ + if (cpu != padata->cpu) { + reorder_via_wq = 1; + cpu = padata->cpu; + } + pqueue = per_cpu_ptr(pd->pqueue, cpu); spin_lock(&pqueue->reorder.lock); @@ -376,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata) put_cpu(); - padata_reorder(pd); + /* If we're running on the wrong CPU, call padata_reorder() via a + * kernel worker. + */ + if (reorder_via_wq) + queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); + else + padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); -- cgit v1.2.3 From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: bpf: perf event change needed for subsequent bpf helpers This patch does not impact existing functionalities. It contains the changes in perf event area needed for subsequent bpf_perf_event_read_value and bpf_perf_prog_read_value helpers. Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 15 +++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..68d866628be0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..902149f05381 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). 
*/ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi -- cgit v1.2.3 From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:20 -0700 Subject: bpf: add helper bpf_perf_event_read_value for perf event array map Hardware pmu counters are limited resources. When there are more pmu based perf events opened than available counters, kernel will multiplex these events so each event gets certain percentage (but not 100%) of the pmu time. In case that multiplexing happens, the number of samples or counter value will not reflect the case compared to no multiplexing. This makes comparison between different runs difficult. Typically, the number of samples or counter value should be normalized before comparing to other experiments. The typical normalization is done like: normalized_num_samples = num_samples * time_enabled / time_running normalized_counter_value = counter_value * time_enabled / time_running where time_enabled is the time enabled for event and time_running is the time running for event since last normalization. This patch adds helper bpf_perf_event_read_value for kprobed based perf event array map, to read perf counter and enabled/running time. The enabled/running time is accumulated since the perf event open. To achieve scaling factor between two bpf invocations, users can can use cpu_id as the key (which is typical for perf array usage model) to remember the previous value and do the calculation inside the bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 4 +++- kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52b022310f6a..590125e29161 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1552,7 +1552,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && - func_id != BPF_FUNC_perf_event_output) + func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_perf_event_read_value) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -1595,6 +1596,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: + case BPF_FUNC_perf_event_read_value: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 95888ae6c263..0be86cc0130e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - u64 value = 0; - int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value, NULL, NULL); + return perf_event_read_local(ee->event, value, enabled, running); +} + +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi @@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); static __always_inline u64 @@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; default: return tracing_func_proto(func_id); } -- cgit v1.2.3 From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001 From: 
Yonghong Song Date: Thu, 5 Oct 2017 09:19:22 -0700 Subject: bpf: add helper bpf_perf_prog_read_value This patch adds helper bpf_perf_prog_read_cvalue for perf event based bpf programs, to read event counter and enabled/running time. The enabled/running time is accumulated since the perf event open. The typical use case for perf event based bpf program is to attach itself to a single event. In such cases, if it is desirable to get scaling factor between two bpf invocations, users can can save the time values in a map, and use the value from the map and the current value to calculate the scaling factor. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0be86cc0130e..04ea5314f2bc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -613,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { + .func = bpf_perf_prog_read_value_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -620,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } -- cgit v1.2.3 From 8fe2d6ccd52b086268f2f36e5e2fc0fe3aeffa80 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 Oct 2017 16:20:56 -0700 Subject: bpf: fix liveness marking while processing Rx = Ry instruction the verifier does regs[insn->dst_reg] = regs[insn->src_reg] which often clears write mark (when Ry doesn't have it) that was just set by check_reg_arg(Rx) prior to the assignment. That causes mark_reg_read() to keep marking Rx in this block as REG_LIVE_READ (since the logic incorrectly misses that it's screened by the write) and in many of its parents (until lucky write into the same Rx or beginning of the program). That causes is_state_visited() logic to miss many pruning opportunities. Furthermore mark_reg_read() logic propagates the read mark for BPF_REG_FP as well (though it's readonly) which causes harmless but unnecssary work during is_state_visited(). Note that do_propagate_liveness() skips FP correctly, so do the same in mark_reg_read() as well. 
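The subtle part is that regs[insn->dst_reg] = regs[insn->src_reg] is a plain struct assignment, so it also copies the source register's liveness bits over the write mark that check_reg_arg() just set on the destination. A small, self-contained illustration of why the explicit fix-up after the copy is needed (simplified types, not the verifier's own):

  enum { REG_LIVE_NONE = 0, REG_LIVE_READ = 1, REG_LIVE_WRITTEN = 2 };

  struct reg_state {
      long value;
      int live;
  };

  static void copy_reg(struct reg_state *regs, int dst, int src)
  {
      regs[dst].live |= REG_LIVE_WRITTEN; /* the mark check_reg_arg() sets */

      regs[dst] = regs[src];              /* struct copy clobbers .live too */

      /*
       * Without this, a later read of dst looks like a read of a value
       * inherited from the parent state and is propagated upwards,
       * which is exactly the missed-screening problem described above.
       */
      regs[dst].live |= REG_LIVE_WRITTEN;
  }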
It saves 0.2 seconds for the test below program before after bpf_lb-DLB_L3.o 2604 2304 bpf_lb-DLB_L4.o 11159 3723 bpf_lb-DUNKNOWN.o 1116 1110 bpf_lxc-DDROP_ALL.o 34566 28004 bpf_lxc-DUNKNOWN.o 53267 39026 bpf_netdev.o 17843 16943 bpf_overlay.o 8672 7929 time ~11 sec ~4 sec Fixes: dc503a8ad984 ("bpf/verifier: track liveness for pruning") Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..8b8d6ba39e23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; + if (regno == BPF_REG_FP) + /* We don't need to worry about FP liveness because it's read-only */ + return; + while (parent) { /* if read wasn't screened by an earlier write ... */ if (state->regs[regno].live & REG_LIVE_WRITTEN) @@ -2345,6 +2349,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ regs[insn->dst_reg] = regs[insn->src_reg]; + regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { -- cgit v1.2.3 From 473d97343f94ff20f5196078314e4dd83156d3a2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:11 -0700 Subject: bpf: Change bpf_obj_name_cpy() to better ensure map's name is init by 0 During get_info_by_fd, the prog/map name is memcpy-ed. It depends on the prog->aux->name and map->name to be zero initialized. bpf_prog_aux is easy to guarantee that aux->name is zero init. The name in bpf_map may be harder to be guaranteed in the future when new map type is added. Hence, this patch makes bpf_obj_name_cpy() to always zero init the prog/map name. Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0048cb24ba7b..d124e702e040 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -322,6 +322,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src) { const char *end = src + BPF_OBJ_NAME_LEN; + memset(dst, 0, BPF_OBJ_NAME_LEN); + /* Copy all isalnum() and '_' char */ while (src < end && *src) { if (!isalnum(*src) && *src != '_') @@ -333,9 +335,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src) if (src == end) return -EINVAL; - /* '\0' terminates dst */ - *dst = 0; - return 0; } -- cgit v1.2.3 From 368211fb920a0b789c238942c6af0414539f79d6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:13 -0700 Subject: bpf: Append prog->aux->name in bpf_get_prog_name() This patch makes the bpf_prog's name available in kallsyms. The new format is bpf_prog_tag[_name]. Sample kallsyms from running selftests/bpf/test_progs: [root@arch-fb-vm1 ~]# egrep ' bpf_prog_[0-9a-fA-F]{16}' /proc/kallsyms ffffffffa0048000 t bpf_prog_dabf0207d1992486_test_obj_id ffffffffa0038000 t bpf_prog_a04f5eef06a7f555__123456789ABCDE ffffffffa0050000 t bpf_prog_a04f5eef06a7f555 Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c6be15ae83ee..248961af2421 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { + const char *end = sym + KSYM_NAME_LEN; + BUILD_BUG_ON(sizeof("bpf_prog_") + - sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + sizeof(prog->tag) * 2 + + /* name has been null terminated. + * We should need +1 for the '_' preceding + * the name. However, the null character + * is double counted between the name and the + * sizeof("bpf_prog_") above, so we omit + * the +1 here. + */ + sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); - *sym = 0; + if (prog->aux->name[0]) + snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); + else + *sym = 0; } static __always_inline unsigned long -- cgit v1.2.3 From 19e1d4e947cac3b5e08225d15ad7744e691c7376 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Oct 2017 12:41:36 +0200 Subject: genirq: Warn when effective affinity is not updated Emit a one time warning when the effective affinity mask is enabled in Kconfig, but the interrupt chip does not update the mask in its irq_set_affinity() callback, Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/manage.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d00132b5c325..ef89f7246656 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +static void irq_validate_effective_affinity(struct irq_data *data) +{ +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + const struct cpumask *m = irq_data_get_effective_affinity_mask(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + + if (!cpumask_empty(m)) + return; + pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", + chip->name, data->irq); +#endif +} + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -181,6 +194,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: + irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } -- cgit v1.2.3 From 60b09c51bb4fb46e2331fdbb39f91520f31d35f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Oct 2017 12:47:24 +0200 Subject: genirq/cpuhotplug: Add sanity check for effective affinity mask The effective affinity mask handling has no safety net when the mask is not updated by the interrupt chip or the mask contains offline CPUs. If that happens the CPU unplug code fails to migrate interrupts. Add sanity checks and emit a warning when the mask contains only offline CPUs. 
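Reduced to its core, the added sanity check asks: once the outgoing CPU is excluded, does the effective mask still name at least one online CPU? If not, an earlier fixup was missed and the migration must be forced. A simplified sketch of that condition (illustrative only; the real code additionally falls back to the general affinity mask when the effective mask is empty):

  #include <linux/cpumask.h>

  /*
   * "Stale" here means: the mask names CPUs other than the one going
   * down, yet none of them is online - a fixup was missed earlier.
   */
  static bool effective_mask_is_stale(const struct cpumask *eff,
                                      unsigned int dying_cpu)
  {
      if (cpumask_any_but(eff, dying_cpu) >= nr_cpu_ids)
          return false;   /* empty or only the dying CPU: not stale */

      return cpumask_any_and(eff, cpu_online_mask) >= nr_cpu_ids;
  }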
Fixes: 415fcf1a2293 ("genirq/cpuhotplug: Use effective affinity mask") Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Christoph Hellwig Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/cpuhotplug.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 638eb9c83d9f..9eb09aef0313 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,8 +18,34 @@ static inline bool irq_needs_fixup(struct irq_data *d) { const struct cpumask *m = irq_data_get_effective_affinity_mask(d); + unsigned int cpu = smp_processor_id(); - return cpumask_test_cpu(smp_processor_id(), m); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + /* + * The cpumask_empty() check is a workaround for interrupt chips, + * which do not implement effective affinity, but the architecture has + * enabled the config switch. Use the general affinity mask instead. + */ + if (cpumask_empty(m)) + m = irq_data_get_affinity_mask(d); + + /* + * Sanity check. If the mask is not empty when excluding the outgoing + * CPU then it must contain at least one online CPU. The outgoing CPU + * has been removed from the online mask already. + */ + if (cpumask_any_but(m, cpu) < nr_cpu_ids && + cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { + /* + * If this happens then there was a missed IRQ fixup at some + * point. Warn about it and enforce fixup. + */ + pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n", + cpumask_pr_args(m), d->irq, cpu); + return true; + } +#endif + return cpumask_test_cpu(cpu, m); } static bool migrate_one_irq(struct irq_desc *desc) -- cgit v1.2.3 From e43b3b58548051f8809391eb7bec7a27ed3003ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Oct 2017 21:07:38 +0200 Subject: genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs Managed interrupts can end up in a stale state on CPU hotplug. If the interrupt is not targeting a single CPU, i.e. the affinity mask spawns multiple CPUs then the following can happen: After boot: dstate: 0x01601200 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 0 After offlining CPU 31 - 24 dstate: 0x01a31000 IRQD_IRQ_DISABLED IRQD_IRQ_MASKED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_AFFINITY_MANAGED IRQD_MANAGED_SHUTDOWN node: 0 affinity: 24-31 effectiv: 24 pending: 0 Now CPU 25 gets onlined again, so it should get the effective interrupt affinity for this interruopt, but due to the x86 interrupt affinity setter restrictions this ends up after restarting the interrupt with: dstate: 0x01601300 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_SETAFFINITY_PENDING IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 24-31 So the interrupt is still affine to CPU 24, which was the last CPU to go offline of that affinity set and the move to an online CPU within 24-31, in this case 25, is pending. This mechanism is x86/ia64 specific as those architectures cannot move interrupts from thread context and do this when an interrupt is actually handled. So the move is set to pending. 
Whats worse is that offlining CPU 25 again results in: dstate: 0x01601300 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_SETAFFINITY_PENDING IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 24-31 This means the interrupt has not been shut down, because the outgoing CPU is not in the effective affinity mask, but of course nothing notices that the effective affinity mask is pointing at an offline CPU. In the case of restarting a managed interrupt the move restriction does not apply, so the affinity setting can be made unconditional. This needs to be done _before_ the interrupt is started up as otherwise the condition for moving it from thread context would not longer be fulfilled. With that change applied onlining CPU 25 after offlining 31-24 results in: dstate: 0x01600200 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 25 pending: And after offlining CPU 25: dstate: 0x01a30000 IRQD_IRQ_DISABLED IRQD_IRQ_MASKED IRQD_SINGLE_TARGET IRQD_AFFINITY_MANAGED IRQD_MANAGED_SHUTDOWN node: 0 affinity: 24-31 effectiv: 25 pending: which is the correct and expected result. Fixes: 761ea388e8c4 ("genirq: Handle managed irqs gracefully in irq_startup()") Reported-by: YASUAKI ISHIMATSU Signed-off-by: Thomas Gleixner Cc: axboe@kernel.dk Cc: linux-scsi@vger.kernel.org Cc: Sumit Saxena Cc: Marc Zyngier Cc: mpe@ellerman.id.au Cc: Shivasharan Srikanteshwara Cc: Kashyap Desai Cc: keith.busch@intel.com Cc: peterz@infradead.org Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/chip.c | 2 +- kernel/irq/manage.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6fc89fd93824..5a2ef92c2782 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: + irq_do_set_affinity(d, aff, false); ret = __irq_startup(desc); - irq_set_affinity_locked(d, aff, false); break; case IRQ_STARTUP_ABORT: return 0; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef89f7246656..4bff6a10ae8e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -188,6 +188,9 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_chip *chip = irq_data_get_irq_chip(data); int ret; + if (!chip || !chip->irq_set_affinity) + return -EINVAL; + ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: -- cgit v1.2.3 From 98589a0998b8b13c4a8fa1ccb0e62751a019faa5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 9 Oct 2017 15:27:15 +0300 Subject: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1' Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced support for attaching an eBPF object by an fd, with the 'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each IPT_SO_SET_REPLACE call. However this breaks subsequent iptables calls: # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT # iptables -A INPUT -s 5.6.7.8 -j ACCEPT iptables: Invalid argument. Run `dmesg' for more information. That's because iptables works by loading existing rules using IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with the replacement set. 
However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number (from the initial "iptables -m bpf" invocation), so when the second invocation occurs, userspace passes a bogus fd number, which causes 'bpf_mt_check_v1' to fail. One suggested solution [1] was to hack iptables userspace, to perform an "entries fixup" immediately after IPT_SO_GET_ENTRIES, by opening a new, process-local fd per every 'xt_bpf_info_v1' entry seen. However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to deprecate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects. This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given '.fd' and instead perform an in-kernel lookup for the bpf object given the provided '.path'. It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is expected to provide the path of the pinned object. Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved. References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2 [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2 Reported-by: Rafael Buchbinder Signed-off-by: Shmulik Ladkani Acked-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- kernel/bpf/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e833ed914358..be1dde967208 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -363,6 +363,7 @@ out: putname(pname); return ret; } +EXPORT_SYMBOL_GPL(bpf_obj_get_user); static void bpf_evict_inode(struct inode *inode) { -- cgit v1.2.3 From 135bd1a230bb69a68c9808a7d25467318900b80a Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 7 Aug 2017 11:20:10 +0530 Subject: rcu: Fix up pending cbs check in rcu_prepare_for_idle The pending-callbacks check in rcu_prepare_for_idle() is backwards. It should accelerate if there are pending callbacks, but the check rather uselessly accelerates only if there are no callbacks. This commit therefore inverts this check. Fixes: 15fecf89e46a ("srcu: Abstract multi-tail callback list handling") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Cc: # 4.12.x --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..fed95fa941e6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1507,7 +1507,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (rcu_segcblist_pend_cbs(&rdp->cblist)) + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ -- cgit v1.2.3 From c63eb17ff06dbcf73e771b9b425c531cc0a9c17b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Aug 2017 12:37:07 -0700 Subject: rcu: Create call_rcu_tasks() kthread at boot time Currently the call_rcu_tasks() kthread is created upon first invocation of call_rcu_tasks(). This has the advantage of avoiding creation if there are never any invocations of call_rcu_tasks() and of synchronize_rcu_tasks(), but it requires an unreliable heuristic to determine when it is safe to create the kthread. For example, it is not safe to create the kthread when call_rcu_tasks() is invoked with a spinlock held, but there is no good way to detect this in !PREEMPT kernels.
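A hedged userspace sketch of the two creation strategies being weighed here, using POSIX threads purely for illustration; spawn_worker(), enqueue_callback_lazy() and init_at_boot() are invented names and the caller_can_block flag stands in for the unreliable heuristic, so this is only a model of the idea, not the RCU code.

/* Eager creation from a known-safe init context versus lazy creation
 * that forces every caller to guess whether spawning is safe. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_t worker;
static bool worker_started;

static void *worker_fn(void *arg) { (void)arg; return NULL; }

static void spawn_worker(void)
{
        if (pthread_create(&worker, NULL, worker_fn, NULL) == 0)
                worker_started = true;
}

/* Old scheme: each call site must guess whether spawning is safe here;
 * the guess is the unreliable heuristic being removed. */
static void enqueue_callback_lazy(bool caller_can_block)
{
        if (!worker_started && caller_can_block)
                spawn_worker();
        /* ... queue the callback ... */
}

/* New scheme: spawn once from a known-safe init context. */
static void init_at_boot(void)
{
        spawn_worker();
}

int main(void)
{
        init_at_boot();                 /* core_initcall() analogue */
        enqueue_callback_lazy(false);   /* the guess no longer matters */
        printf("worker started: %d\n", worker_started);
        if (worker_started)
                pthread_join(worker, NULL);
        return 0;
}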
This commit therefore creates this kthread unconditionally at core_initcall() time. If you don't want this kthread created, then build with CONFIG_TASKS_RCU=n. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..e9bbedbb8745 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -575,7 +575,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; /** @@ -600,7 +599,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; - bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -610,11 +608,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ - if ((needwake && havetask) || - (!havetask && !irqs_disabled_flags(flags))) { - rcu_spawn_tasks_kthread(); + if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) wake_up(&rcu_tasks_cbs_wq); - } } EXPORT_SYMBOL_GPL(call_rcu_tasks); @@ -853,27 +848,18 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */ -static void rcu_spawn_tasks_kthread(void) +/* Spawn rcu_tasks_kthread() at core_initcall() time. */ +static int __init rcu_spawn_tasks_kthread(void) { - static DEFINE_MUTEX(rcu_tasks_kthread_mutex); struct task_struct *t; - if (READ_ONCE(rcu_tasks_kthread_ptr)) { - smp_mb(); /* Ensure caller sees full kthread. */ - return; - } - mutex_lock(&rcu_tasks_kthread_mutex); - if (rcu_tasks_kthread_ptr) { - mutex_unlock(&rcu_tasks_kthread_mutex); - return; - } t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); BUG_ON(IS_ERR(t)); smp_mb(); /* Ensure others see full kthread. */ WRITE_ONCE(rcu_tasks_kthread_ptr, t); - mutex_unlock(&rcu_tasks_kthread_mutex); + return 0; } +core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) -- cgit v1.2.3 From 6733bab7bc09b67668028dab562caea1b4ff3c69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Aug 2017 10:59:16 -0700 Subject: irq_work: Map irq_work_on_queue() to irq_work_on() in !SMP Commit 478850160636 ("irq_work: Implement remote queueing") provides irq_work_on_queue() only for SMP builds. However, providing it simplifies code that submits irq_work to lists of CPUs, eliminating the !SMP special cases. This commit therefore maps irq_work_on_queue() to irq_work_on() in !SMP builds, but validating the specified CPU. Signed-off-by: Paul E. McKenney --- kernel/irq_work.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..9f20f6c72579 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void) */ } -#ifdef CONFIG_SMP /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. 
@@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); +#ifdef CONFIG_SMP + /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); @@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) arch_send_call_function_single_ipi(cpu); +#else /* #ifdef CONFIG_SMP */ + irq_work_queue(work); +#endif /* #else #ifdef CONFIG_SMP */ + return true; } -EXPORT_SYMBOL_GPL(irq_work_queue_on); -#endif /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) -- cgit v1.2.3 From 7c2102e56a3f7d85b5d8f33efbd7aecc1f36fdd8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Sep 2017 08:54:40 -0700 Subject: sched: Make resched_cpu() unconditional The current implementation of synchronize_sched_expedited() incorrectly assumes that resched_cpu() is unconditional, which it is not. This means that synchronize_sched_expedited() can hang when resched_cpu()'s trylock fails as follows (analysis by Neeraj Upadhyay): o CPU1 is waiting for expedited wait to complete: sync_rcu_exp_select_cpus rdp->exp_dynticks_snap & 0x1 // returns 1 for CPU5 IPI sent to CPU5 synchronize_sched_expedited_wait ret = swait_event_timeout(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); expmask = 0x20, CPU 5 in idle path (in cpuidle_enter()) o CPU5 handles IPI and fails to acquire rq lock. Handles IPI sync_sched_exp_handler resched_cpu returns while failing to try lock acquire rq->lock need_resched is not set o CPU5 calls rcu_idle_enter() and as need_resched is not set, goes to idle (schedule() is not called). o CPU 1 reports RCU stall. Given that resched_cpu() is now used only by RCU, this commit fixes the assumption by making resched_cpu() unconditional. Reported-by: Neeraj Upadhyay Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..8fa7b6f9e19b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,8 +505,7 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; + raw_spin_lock_irqsave(&rq->lock, flags); resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From f79c3ad6189624c3de0ad5521610c9e22a1c33cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2016 06:24:30 -0800 Subject: sched,rcu: Make cond_resched() provide RCU quiescent state There is some confusion as to which of cond_resched() or cond_resched_rcu_qs() should be added to long in-kernel loops. This commit therefore eliminates the decision by adding RCU quiescent states to cond_resched(). This commit also simplifies the code that used to interact with cond_resched_rcu_qs(), and that now interacts with cond_resched(), to reduce its overhead. This reduction is necessary to allow the heavier-weight cond_resched_rcu_qs() mechanism to be invoked everywhere that cond_resched() is invoked. Part of that reduction in overhead converts the jiffies_till_sched_qs kernel parameter to read-only at runtime, thus eliminating the need for bounds checking. 
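The following standalone sketch models that idea: one helper either reschedules or reports a quiescent state, so a long loop needs only a single call; cond_resched_model(), need_resched_now and qs_count are illustrative stand-ins for need_resched(), schedule() and rcu_all_qs(), not the kernel implementation.

/* Toy model of cond_resched() also acting as an RCU quiescent state. */
#include <stdbool.h>
#include <stdio.h>

static bool need_resched_now;   /* stands in for need_resched() */
static unsigned long qs_count;  /* counts rcu_all_qs() analogues */

static int cond_resched_model(void)
{
        if (need_resched_now) {
                need_resched_now = false;
                /* schedule() would run here; the context switch itself
                 * is already a quiescent state. */
                return 1;
        }
        qs_count++;             /* the added rcu_all_qs() call */
        return 0;
}

int main(void)
{
        for (int i = 0; i < 1000; i++) {
                if (i % 250 == 0)
                        need_resched_now = true;
                cond_resched_model();   /* long in-kernel loop analogue */
        }
        printf("quiescent states reported: %lu\n", qs_count);
        return 0;
}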
Reported-by: Michal Hocko Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra [ paulmck: Keep PREEMPT=n cond_resched a no-op, per Peter Zijlstra. ] --- kernel/rcu/tree.c | 25 +++++-------------------- kernel/sched/core.c | 1 + 2 files changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0ad62b0e7b8..0dda57a28276 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -534,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644); * How long the grace period must be before we start recruiting * quiescent-state help from rcu_note_context_switch(). */ -static ulong jiffies_till_sched_qs = HZ / 20; -module_param(jiffies_till_sched_qs, ulong, 0644); +static ulong jiffies_till_sched_qs = HZ / 10; +module_param(jiffies_till_sched_qs, ulong, 0444); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); @@ -1235,7 +1235,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) unsigned long jtsq; bool *rnhqp; bool *ruqp; - unsigned long rjtsc; struct rcu_node *rnp; /* @@ -1252,23 +1251,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* Compute and saturate jiffies_till_sched_qs. */ - jtsq = jiffies_till_sched_qs; - rjtsc = rcu_jiffies_till_stall_check(); - if (jtsq > rjtsc / 2) { - WRITE_ONCE(jiffies_till_sched_qs, rjtsc); - jtsq = rjtsc / 2; - } else if (jtsq < 1) { - WRITE_ONCE(jiffies_till_sched_qs, 1); - jtsq = 1; - } - /* * Has this CPU encountered a cond_resched_rcu_qs() since the * beginning of the grace period? For this to be the case, * the CPU has to have noticed the current grace period. This * might not be the case for nohz_full CPUs looping in the kernel. */ + jtsq = jiffies_till_sched_qs; rnp = rdp->mynode; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && @@ -1276,7 +1265,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; - } else { + } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ smp_store_release(ruqp, true); } @@ -1304,10 +1293,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * updates are only once every few jiffies, the probability of * lossage (and thus of slight grace-period extension) is * quite low. - * - * Note that if the jiffies_till_sched_qs boot/sysfs parameter - * is set too high, we override with half of the RCU CPU stall - * warning delay. */ rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && @@ -1316,7 +1301,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ + rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */ } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..904d3ab35c83 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4842,6 +4842,7 @@ int __sched _cond_resched(void) preempt_schedule_common(); return 1; } + rcu_all_qs(); return 0; } EXPORT_SYMBOL(_cond_resched); -- cgit v1.2.3 From 9b9500da81502738efa1b485a8835f174ff7be6d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 17 Aug 2017 17:05:59 -0700 Subject: rcu: Make RCU CPU stall warnings check for irq-disabled CPUs One common question upon seeing an RCU CPU stall warning is "did the stalled CPUs have interrupts disabled?" However, the current stall warnings are silent on this point. This commit therefore uses irq_work to check whether stalled CPUs still respond to IPIs, and flags this state in the RCU CPU stall warning console messages. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 104 +++++++++++++++++++++++++++++++++++++++++------ kernel/rcu/tree.h | 5 +++ kernel/rcu/tree_plugin.h | 7 +++- 3 files changed, 103 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0dda57a28276..12838a9a128e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1206,6 +1206,22 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; } +/* + * We are reporting a quiescent state on behalf of some other CPU, so + * it is our responsibility to check for and handle potential overflow + * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * After all, the CPU might be in deep idle state, and thus executing no + * code whatsoever. + */ +static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) +{ + lockdep_assert_held(&rnp->lock); + if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + WRITE_ONCE(rdp->gpwrap, true); + if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) + rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; +} + /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU @@ -1216,14 +1232,33 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, - rdp->mynode->gpnum)) - WRITE_ONCE(rdp->gpwrap, true); + rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } return 0; } +/* + * Handler for the irq_work request posted when a grace period has + * gone on for too long, but not yet long enough for an RCU CPU + * stall warning. Set state appropriately, but just complain if + * there is unexpected state on entry. 
+ */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + /* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks @@ -1235,7 +1270,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) unsigned long jtsq; bool *rnhqp; bool *ruqp; - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1248,6 +1283,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } @@ -1258,12 +1294,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * might not be the case for nohz_full CPUs looping in the kernel. */ jtsq = jiffies_till_sched_qs; - rnp = rdp->mynode; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + rcu_gpnum_ovf(rnp, rdp); return 1; } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ @@ -1274,6 +1310,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } @@ -1305,11 +1342,22 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * If more than halfway to RCU CPU stall-warning time, do - * a resched_cpu() to try to loosen things up a bit. + * If more than halfway to RCU CPU stall-warning time, do a + * resched_cpu() to try to loosen things up a bit. Also check to + * see if the CPU is getting hammered with interrupts, but only + * once per grace period, just to keep the IPIs down to a dull roar. 
*/ - if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + (rnp->ffmask & rdp->grpmask)) { + init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + rdp->rcu_iw_pending = true; + rdp->rcu_iw_gpnum = rnp->gpnum; + irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); + } + } return 0; } @@ -1498,6 +1546,7 @@ static void print_cpu_stall(struct rcu_state *rsp) { int cpu; unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1513,7 +1562,9 @@ static void print_cpu_stall(struct rcu_state *rsp) */ pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(rsp, smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, @@ -1907,6 +1958,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); } return ret; } @@ -3685,6 +3737,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; + rdp->rcu_iw_pending = false; + rdp->rcu_iw_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -3722,10 +3776,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) */ int rcutree_online_cpu(unsigned int cpu) { - sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask |= rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_online_cpu(cpu); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return 0; /* Too early in boot for scheduler work. */ + sync_sched_exp_online_cleanup(cpu); + rcutree_affinity_setting(cpu, -1); return 0; } @@ -3735,6 +3803,19 @@ int rcutree_online_cpu(unsigned int cpu) */ int rcutree_offline_cpu(unsigned int cpu) { + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + rcutree_affinity_setting(cpu, cpu); if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_offline_cpu(cpu); @@ -4183,8 +4264,7 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); + rcutree_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e1f285f0a70..46a5d1991450 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -103,6 +103,7 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. 
*/ + unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -285,6 +286,10 @@ struct rcu_data { /* 8) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ + /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ + struct irq_work rcu_iw; /* Check for non-irq activity. */ + bool rcu_iw_pending; /* Is ->rcu_iw pending? */ + unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..14977d0470d1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1671,6 +1671,7 @@ static void print_cpu_stall_info_begin(void) */ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) { + unsigned long delta; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = rdp->dynticks; @@ -1685,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : + "!."[!delta], ticks_value, ticks_title, rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, -- cgit v1.2.3 From 83b6ca1fede773eebcdfb44f5a94eb410d48b886 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Aug 2017 16:47:08 -0700 Subject: rcu: Turn off tracing before dumping trace Currently, RCU allows tracing to continue when it automatically does ftrace_dump() after detecting an error condition, which can result in excessively large traces and lost trace events. This commit therefore does a tracing_off() before any of these ftrace_dump() calls. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e4b43fef89f5..b8729eb09a5d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -220,8 +220,10 @@ do { \ static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ \ if (!atomic_read(&___rfd_beenhere) && \ - !atomic_xchg(&___rfd_beenhere, 1)) \ + !atomic_xchg(&___rfd_beenhere, 1)) { \ + tracing_off(); \ ftrace_dump(oops_dump_mode); \ + } \ } while (0) void rcu_early_boot_tests(void); -- cgit v1.2.3 From f22ce09157239aab08eae99c678ef664f71a9097 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Sep 2017 14:40:54 -0700 Subject: rcu: Suppress RCU CPU stall warnings while dumping trace Currently, RCU emits Suppress RCU CPU stall warnings during its automatically initiated ftrace_dump() calls after detecting an error condition, which can result in excessively excessive console output and lost trace events. This commit therefore suppresses RCU CPU stall warnings across any of these ftrace_dump() calls. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu.h | 17 +++++++++++++++++ kernel/rcu/update.c | 1 + 2 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index b8729eb09a5d..59c471de342a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) extern int rcu_cpu_stall_suppress; int rcu_jiffies_till_stall_check(void); +#define rcu_ftrace_dump_stall_suppress() \ +do { \ + if (!rcu_cpu_stall_suppress) \ + rcu_cpu_stall_suppress = 3; \ +} while (0) + +#define rcu_ftrace_dump_stall_unsuppress() \ +do { \ + if (rcu_cpu_stall_suppress == 3) \ + rcu_cpu_stall_suppress = 0; \ +} while (0) + +#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ +#define rcu_ftrace_dump_stall_suppress() +#define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ /* @@ -222,7 +237,9 @@ do { \ if (!atomic_read(&___rfd_beenhere) && \ !atomic_xchg(&___rfd_beenhere, 1)) { \ tracing_off(); \ + rcu_ftrace_dump_stall_suppress(); \ ftrace_dump(oops_dump_mode); \ + rcu_ftrace_dump_stall_unsuppress(); \ } \ } while (0) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..3dc8efb16dc7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -494,6 +494,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); -- cgit v1.2.3 From 2b1516e55f8416acfb48d5f43d41222d180fb5a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Aug 2017 16:11:37 -0700 Subject: rcutorture: Add interrupt-disable capability to stall-warning tests When rcutorture sees the rcutorture.stall_cpu kernel boot parameter, it loops with preemption disabled, which does in fact normally generate an RCU CPU stall warning message. However, there are test scenarios that need the stalling CPU to have interrupts disabled. This commit therefore adds an rcutorture.stall_cpu_irqsoff kernel boot parameter that causes the stalling CPU to disable interrupts. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45f2ffbc1e78..0273bc0a8586 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -89,6 +89,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1357,7 +1358,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " - "stall_cpu=%d stall_cpu_holdoff=%d " + "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1365,7 +1366,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, - stall_cpu, stall_cpu_holdoff, + stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1430,12 +1431,19 @@ static int rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - pr_alert("rcu_torture_stall start.\n"); rcu_read_lock(); - preempt_disable(); + if (stall_cpu_irqsoff) + local_irq_disable(); + else + preempt_disable(); + pr_alert("rcu_torture_stall start on CPU %d.\n", + smp_processor_id()); while (ULONG_CMP_LT(get_seconds(), stop_at)) continue; /* Induce RCU CPU stall warning. */ - preempt_enable(); + if (stall_cpu_irqsoff) + local_irq_enable(); + else + preempt_enable(); rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } -- cgit v1.2.3 From 0032f4e889764d22ccccb6a15742071d6f0d1f5a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Aug 2017 10:40:17 -0700 Subject: rcutorture: Dump writer stack if stalled Right now, rcutorture warns if an rcu_torture_writer() kthread stalls, but this warning is not always all that helpful. This commit therefore makes the first such warning include a stack dump. This in turn requires that sched_show_task() be exported to GPL modules, so this commit makes that change as well. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 ++++++ kernel/sched/core.c | 1 + 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0273bc0a8586..362eb2f78b3c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "rcu.h" @@ -1240,6 +1241,7 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; static unsigned long rtcv_snap = ULONG_MAX; + static bool splatted; struct task_struct *wtp; for_each_possible_cpu(cpu) { @@ -1325,6 +1327,10 @@ rcu_torture_stats_print(void) gpnum, completed, flags, wtp == NULL ? 
~0UL : wtp->state, wtp == NULL ? -1 : (int)task_cpu(wtp)); + if (!splatted && wtp) { + sched_show_task(wtp); + splatted = true; + } show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..7ae0151dcc1d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5165,6 +5165,7 @@ void sched_show_task(struct task_struct *p) show_stack(p, NULL); put_task_stack(p); } +EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) -- cgit v1.2.3 From 96ca579a1ecc943b75beba58bebb0356f6cc4b51 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Oct 2017 11:36:52 -0700 Subject: waitid(): Add missing access_ok() checks Adds missing access_ok() checks. CVE-2017-5123 Reported-by: Chris Salls Signed-off-by: Kees Cook Acked-by: Al Viro Fixes: 4c48abe91be0 ("waitid(): switch copyout of siginfo to unsafe_put_user()") Cc: stable@kernel.org # 4.13 Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f2cd53e92147..cf28528842bc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; + if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + goto Efault; + user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); @@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; + if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + goto Efault; + user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); -- cgit v1.2.3 From fbb1fb4ad415cb31ce944f65a5ca700aaf73a227 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Oct 2017 21:44:52 -0700 Subject: net: defer call to cgroup_sk_alloc() sk_clone_lock() might run while TCP/DCCP listener already vanished. In order to prevent use after free, it is better to defer cgroup_sk_alloc() to the point we know both parent and child exist, and from process context. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: David S. Miller --- kernel/cgroup/cgroup.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 44857278eb8a..3380a3e49af5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5709,17 +5709,6 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - return; - } - rcu_read_lock(); while (true) { -- cgit v1.2.3 From 8b405d5c5d0996d3d16f70c42744a0500f5b6ec3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Oct 2017 11:13:37 +0200 Subject: locking/lockdep: Fix stacktrace mess There is some complication between check_prevs_add() and check_prev_add() wrt. saving stack traces. The problem is that we want to be frugal with saving stack traces, since it consumes static resources. We'll only know in check_prev_add() if we need the trace, but we can call into it multiple times. 
So we want to do on-demand and re-use. A further complication is that check_prev_add() can drop graph_lock and mess with our static resources. In any case, the current state; after commit: ce07a9415f26 ("locking/lockdep: Make check_prev_add() able to handle external stack_trace") is that we'll assume the trace contains valid data once check_prev_add() returns '2'. However, as noted by Josh, this is false, check_prev_add() can return '2' before having saved a trace, this then result in the possibility of using uninitialized data. Testing, as reported by Wu, shows a NULL deref. So simplify. Since the graph_lock() thing is a debug path that hasn't really been used in a long while, take it out back and avoid the head-ache. Further initialize the stack_trace to a known 'empty' state; as long as nr_entries == 0, nothing should deref entries. We can then use the 'entries == NULL' test for a valid trace / on-demand saving. Analyzed-by: Josh Poimboeuf Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ce07a9415f26 ("locking/lockdep: Make check_prev_add() able to handle external stack_trace") Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 44c8d0d17170..e36e652d996f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, int distance, struct stack_trace *trace, int (*save)(struct stack_trace *trace)) { + struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; - int ret; struct lock_list this; - struct lock_list *uninitialized_var(target_entry); + int ret; /* * Prove that the new -> dependency would not @@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(next); this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) + if (unlikely(!ret)) { + if (!trace->entries) { + /* + * If @save fails here, the printing might trigger + * a WARN but because of the !nr_entries it should + * not do bad things. + */ + save(trace); + } return print_circular_bug(&this, target_entry, next, prev, trace); + } else if (unlikely(ret < 0)) return print_bfs_bug(ret); @@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (save && !save(trace)) + if (!trace->entries && !save(trace)) return 0; /* @@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (!ret) return 0; - /* - * Debugging printouts: - */ - if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(hlock_class(prev)); - printk(KERN_CONT " => "); - print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); - dump_stack(); - if (!graph_lock()) - return 0; - } return 2; } @@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; struct held_lock *hlock; - struct stack_trace trace; - int (*save)(struct stack_trace *trace) = save_trace; + struct stack_trace trace = { + .nr_entries = 0, + .max_entries = 0, + .entries = NULL, + .skip = 0, + }; /* * Debugging checks. 
@@ -2018,17 +2017,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { int ret = check_prev_add(curr, hlock, next, - distance, &trace, save); + distance, &trace, save_trace); if (!ret) return 0; - /* - * Stop saving stack_trace if save_trace() was - * called at least once: - */ - if (save && ret == 2) - save = NULL; - /* * Stop after the first non-trylock entry, * as non-trylock entries have added their -- cgit v1.2.3 From df0062b27ebf473b372914a3e3574d93790e2b72 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 15:20:50 +0100 Subject: perf/core: Avoid freeing static PMU contexts when PMU is unregistered Since commit: 1fd7e4169954 ("perf/core: Remove perf_cpu_context::unique_pmu") ... when a PMU is unregistered then its associated ->pmu_cpu_context is unconditionally freed. Whilst this is fine for dynamically allocated context types (i.e. those registered using perf_invalid_context), this causes a problem for sharing of static contexts such as perf_{sw,hw}_context, which are used by multiple built-in PMUs and effectively have a global lifetime. Whilst testing the ARM SPE driver, which must use perf_sw_context to support per-task AUX tracing, unregistering the driver as a result of a module unload resulted in: Unable to handle kernel NULL pointer dereference at virtual address 00000038 Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: [last unloaded: arm_spe_pmu] PC is at ctx_resched+0x38/0xe8 LR is at perf_event_exec+0x20c/0x278 [...] ctx_resched+0x38/0xe8 perf_event_exec+0x20c/0x278 setup_new_exec+0x88/0x118 load_elf_binary+0x26c/0x109c search_binary_handler+0x90/0x298 do_execveat_common.isra.14+0x540/0x618 SyS_execve+0x38/0x48 since the software context has been freed and the ctx.pmu->pmu_disable_count field has been set to NULL. This patch fixes the problem by avoiding the freeing of static PMU contexts altogether. Whilst the sharing of dynamic contexts is questionable, this actually requires the caller to share their context pointer explicitly and so the burden is on them to manage the object lifetime. Reported-by: Kim Phillips Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1fd7e4169954 ("perf/core: Remove perf_cpu_context::unique_pmu") Link: http://lkml.kernel.org/r/1507040450-7730-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..243bfc68d0fe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8955,6 +8955,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) static void free_pmu_context(struct pmu *pmu) { + /* + * Static contexts such as perf_sw_context have a global lifetime + * and may be shared between different PMUs. Avoid freeing them + * when a single PMU is going away. + */ + if (pmu->task_ctx_nr > perf_invalid_context) + return; + mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); mutex_unlock(&pmus_lock); -- cgit v1.2.3 From e6a5203399d19871021c1fa0eb2a08fc63b67e91 Mon Sep 17 00:00:00 2001 From: "leilei.lin" Date: Fri, 29 Sep 2017 13:54:44 +0800 Subject: perf/core: Fix cgroup time when scheduling descendants Update cgroup time when an event is scheduled in by descendants. 
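A small standalone model of the relaxed check, assuming a toy parent-linked cgroup structure; struct toy_cgroup and is_descendant() are illustrative names for what cgroup_is_descendant() does in the hunk below.

/* An event attached to a cgroup should have its time updated when the
 * currently scheduled cgroup is any descendant of it, not only when it
 * is the exact same cgroup. */
#include <stdbool.h>
#include <stdio.h>

struct toy_cgroup {
        const char *name;
        struct toy_cgroup *parent;
};

static bool is_descendant(struct toy_cgroup *c, struct toy_cgroup *ancestor)
{
        for (; c; c = c->parent)
                if (c == ancestor)
                        return true;
        return false;
}

int main(void)
{
        struct toy_cgroup root = { "root", NULL };
        struct toy_cgroup a    = { "a",    &root };
        struct toy_cgroup a_b  = { "a/b",  &a };

        /* Old check: only "a" itself would update an event on "a". */
        printf("a/b == a    : %d\n", &a_b == &a);
        /* New check: scheduling inside "a/b" also updates it. */
        printf("a/b under a : %d\n", is_descendant(&a_b, &a));
        return 0;
}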
Reviewed-and-tested-by: Jiri Olsa Signed-off-by: leilei.lin Signed-off-by: Peter Zijlstra (Intel) Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: brendan.d.gregg@gmail.com Cc: yang_oliver@hotmail.com Link: http://lkml.kernel.org/r/CALPjY3mkHiekRkRECzMi9G-bjUQOvOjVBAqxmWkTzc-g+0LwMg@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 243bfc68d0fe..9d93db81fa36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgrp == event->cgrp) + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) __update_cgrp_time(event->cgrp); } -- cgit v1.2.3 From d153b153446f7d8832bb2ebd92309c8a6003b3bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Sep 2017 11:35:30 +0200 Subject: sched/core: Fix wake_affine() performance regression Eric reported a sysbench regression against commit: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed against his v3.10 enterprise kernel. PRE (current tip/master): ivb-ep sysbench: 2: [30 secs] transactions: 64110 (2136.94 per sec.) 5: [30 secs] transactions: 143644 (4787.99 per sec.) 10: [30 secs] transactions: 274298 (9142.93 per sec.) 20: [30 secs] transactions: 418683 (13955.45 per sec.) 40: [30 secs] transactions: 320731 (10690.15 per sec.) 80: [30 secs] transactions: 355096 (11834.28 per sec.) hsw-ex NAS: OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93 lu.C.x_threads_144_run_1.log: Time in seconds = 434.68 lu.C.x_threads_144_run_2.log: Time in seconds = 405.36 lu.C.x_threads_144_run_3.log: Time in seconds = 433.83 POST (+patch): ivb-ep sysbench: 2: [30 secs] transactions: 64494 (2149.75 per sec.) 5: [30 secs] transactions: 145114 (4836.99 per sec.) 10: [30 secs] transactions: 278311 (9276.69 per sec.) 20: [30 secs] transactions: 437169 (14571.60 per sec.) 40: [30 secs] transactions: 669837 (22326.73 per sec.) 80: [30 secs] transactions: 631739 (21055.88 per sec.) hsw-ex NAS: lu.C.x_threads_144_run_1.log: Time in seconds = 23.36 lu.C.x_threads_144_run_2.log: Time in seconds = 22.96 lu.C.x_threads_144_run_3.log: Time in seconds = 22.52 This patch takes out all the shiny wake_affine() stuff and goes back to utter basics. Between the two CPUs involved with the wakeup (the CPU doing the wakeup and the CPU we ran on previously) pick the CPU we can run on _now_. This restores much of the regressions against the older kernels, but leaves some ground in the overloaded case. The default-enabled WA_WEIGHT (which will be introduced in the next patch) is an attempt to address the overloaded situation. 
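To make the "pick the CPU we can run on now" rule concrete, here is a minimal standalone model of the wake_affine_idle() decision shown in the diff below; struct toy_cpu and wake_affine_idle_model() are illustrative names and the sync handling is reduced to a single nr_running check.

/* Between the waking CPU and the task's previous CPU, prefer the waking
 * CPU only if it is idle now, or about to be for a sync wakeup. */
#include <stdbool.h>
#include <stdio.h>

struct toy_cpu {
        bool idle;
        int nr_running;
};

static bool wake_affine_idle_model(const struct toy_cpu *this_cpu, bool sync)
{
        if (this_cpu->idle)
                return true;            /* run the wakee here right away */

        /* A sync wakeup means the waker is about to sleep; if it is the
         * only runnable task, this CPU is effectively idle. */
        if (sync && this_cpu->nr_running == 1)
                return true;

        return false;                   /* stay on prev_cpu */
}

int main(void)
{
        struct toy_cpu busy       = { .idle = false, .nr_running = 3 };
        struct toy_cpu only_waker = { .idle = false, .nr_running = 1 };

        printf("busy, async wakeup: %d\n", wake_affine_idle_model(&busy, false));
        printf("waker only, sync  : %d\n", wake_affine_idle_model(&only_waker, true));
        return 0;
}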
Reported-by: Eric Farman Signed-off-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Matthew Rosato Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: jinpuwang@gmail.com Cc: vcaputo@pengaru.com Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 126 ++++++------------------------------------------ kernel/sched/features.h | 1 + 2 files changed, 16 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..28cabed85387 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p) return 1; } -struct llc_stats { - unsigned long nr_running; - unsigned long load; - unsigned long capacity; - int has_capacity; -}; - -static bool get_llc_stats(struct llc_stats *stats, int cpu) -{ - struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - - if (!sds) - return false; - - stats->nr_running = READ_ONCE(sds->nr_running); - stats->load = READ_ONCE(sds->load); - stats->capacity = READ_ONCE(sds->capacity); - stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); - - return true; -} - /* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous + * CPU. * - * Since we're running on 'stale' values, we might in fact create an imbalance - * but recomputing these values is expensive, as that'd mean iteration 2 cache - * domains worth of CPUs. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or + * will be) idle. */ + static bool -wake_affine_llc(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +wake_affine_idle(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) { - struct llc_stats prev_stats, this_stats; - s64 this_eff_load, prev_eff_load; - unsigned long task_load; - - if (!get_llc_stats(&prev_stats, prev_cpu) || - !get_llc_stats(&this_stats, this_cpu)) - return false; - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current LLC. - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - /* in this case load hits 0 and this LLC is considered 'idle' */ - if (current_load > this_stats.load) - return true; - - this_stats.load -= current_load; - } - - /* - * The has_capacity stuff is not SMT aware, but by trying to balance - * the nr_running on both ends we try and fill the domain at equal - * rates, thereby first consuming cores before siblings. - */ - - /* if the old cache has capacity, stay there */ - if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) - return false; - - /* if this cache has capacity, come here */ - if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) + if (idle_cpu(this_cpu)) return true; - /* - * Check to see if we can move the load without causing too much - * imbalance. 
- */ - task_load = task_h_load(p); - - this_eff_load = 100; - this_eff_load *= prev_stats.capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_stats.capacity; - - this_eff_load *= this_stats.load + task_load; - prev_eff_load *= prev_stats.load - task_load; + if (sync && cpu_rq(this_cpu)->nr_running == 1) + return true; - return this_eff_load <= prev_eff_load; + return false; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine; - - /* - * Default to no affine wakeups; wake_affine() should not effect a task - * placement the load-balancer feels inclined to undo. The conservative - * option is therefore to not move tasks when they wake up. - */ - affine = false; + bool affine = false; - /* - * If the wakeup is across cache domains, try to evaluate if movement - * makes sense, otherwise rely on select_idle_siblings() to do - * placement inside the cache domain. - */ - if (!cpus_share_cache(prev_cpu, this_cpu)) - affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE) && !affine) + affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { @@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain_shared *shared = env->sd->shared; struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; @@ -7672,22 +7592,6 @@ next_group: if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; } - - if (!shared) - return; - - /* - * Since these are sums over groups they can contain some CPUs - * multiple times for the NUMA domains. - * - * Currently only wake_affine_llc() and find_busiest_group() - * uses these numbers, only the last is affected by this problem. - * - * XXX fix that. - */ - WRITE_ONCE(shared->nr_running, sds->total_running); - WRITE_ONCE(shared->load, sds->total_load); - WRITE_ONCE(shared->capacity, sds->total_capacity); } /** diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d3fb15555291..0a519f8c224d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +SCHED_FEAT(WA_IDLE, true) -- cgit v1.2.3 From f2cdd9cc6c97e617b95f430f527a6e3165e1bee8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Oct 2017 09:23:24 +0200 Subject: sched/core: Address more wake_affine() regressions The trivial wake_affine_idle() implementation is very good for a number of workloads, but it comes apart at the moment there are no idle CPUs left, IOW. the overloaded case. 
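As a sketch of how the weight-based fallback handles that overloaded case, the following standalone model mirrors the wake_affine_weight() hunk further down while ignoring the sync-wakeup adjustment; wake_affine_weight_model() and its parameters are illustrative, and the example loads and the imbalance_pct value of 117 are made-up numbers.

/* Compare the load each CPU would carry after the move, scaled by CPU
 * capacity, with a small bias towards leaving the task where it is. */
#include <stdbool.h>
#include <stdio.h>

static bool wake_affine_weight_model(long this_load, long this_capacity,
                                     long prev_load, long prev_capacity,
                                     long task_load, int imbalance_pct)
{
        /* Load the waking CPU would carry if it takes the task ... */
        long this_eff = (this_load + task_load) * 100 * prev_capacity;
        /* ... versus the previous CPU keeping it, biased by imbalance_pct. */
        long prev_eff = (prev_load - task_load) *
                        (100 + (imbalance_pct - 100) / 2) * this_capacity;

        return this_eff <= prev_eff;    /* true: pull to the waking CPU */
}

int main(void)
{
        /* Equal capacities: move only if the waking CPU is clearly lighter. */
        printf("%d\n", wake_affine_weight_model(400, 1024, 900, 1024, 200, 117));
        printf("%d\n", wake_affine_weight_model(900, 1024, 400, 1024, 200, 117));
        return 0;
}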
hackbench: NO_WA_WEIGHT WA_WEIGHT hackbench-20 : 7.362717561 seconds 6.450509391 seconds (win) netperf: NO_WA_WEIGHT WA_WEIGHT TCP_SENDFILE-1 : Avg: 54524.6 Avg: 52224.3 TCP_SENDFILE-10 : Avg: 48185.2 Avg: 46504.3 TCP_SENDFILE-20 : Avg: 29031.2 Avg: 28610.3 TCP_SENDFILE-40 : Avg: 9819.72 Avg: 9253.12 TCP_SENDFILE-80 : Avg: 5355.3 Avg: 4687.4 TCP_STREAM-1 : Avg: 41448.3 Avg: 42254 TCP_STREAM-10 : Avg: 24123.2 Avg: 25847.9 TCP_STREAM-20 : Avg: 15834.5 Avg: 18374.4 TCP_STREAM-40 : Avg: 5583.91 Avg: 5599.57 TCP_STREAM-80 : Avg: 2329.66 Avg: 2726.41 TCP_RR-1 : Avg: 80473.5 Avg: 82638.8 TCP_RR-10 : Avg: 72660.5 Avg: 73265.1 TCP_RR-20 : Avg: 52607.1 Avg: 52634.5 TCP_RR-40 : Avg: 57199.2 Avg: 56302.3 TCP_RR-80 : Avg: 25330.3 Avg: 26867.9 UDP_RR-1 : Avg: 108266 Avg: 107844 UDP_RR-10 : Avg: 95480 Avg: 95245.2 UDP_RR-20 : Avg: 68770.8 Avg: 68673.7 UDP_RR-40 : Avg: 76231 Avg: 75419.1 UDP_RR-80 : Avg: 34578.3 Avg: 35639.1 UDP_STREAM-1 : Avg: 64684.3 Avg: 66606 UDP_STREAM-10 : Avg: 52701.2 Avg: 52959.5 UDP_STREAM-20 : Avg: 30376.4 Avg: 29704 UDP_STREAM-40 : Avg: 15685.8 Avg: 15266.5 UDP_STREAM-80 : Avg: 8415.13 Avg: 7388.97 (wins and losses) sysbench: NO_WA_WEIGHT WA_WEIGHT sysbench-mysql-2 : 2135.17 per sec. 2142.51 per sec. sysbench-mysql-5 : 4809.68 per sec. 4800.19 per sec. sysbench-mysql-10 : 9158.59 per sec. 9157.05 per sec. sysbench-mysql-20 : 14570.70 per sec. 14543.55 per sec. sysbench-mysql-40 : 22130.56 per sec. 22184.82 per sec. sysbench-mysql-80 : 20995.56 per sec. 21904.18 per sec. sysbench-psql-2 : 1679.58 per sec. 1705.06 per sec. sysbench-psql-5 : 3797.69 per sec. 3879.93 per sec. sysbench-psql-10 : 7253.22 per sec. 7258.06 per sec. sysbench-psql-20 : 11166.75 per sec. 11220.00 per sec. sysbench-psql-40 : 17277.28 per sec. 17359.78 per sec. sysbench-psql-80 : 17112.44 per sec. 17221.16 per sec. (increase on the top end) tbench: NO_WA_WEIGHT Throughput 685.211 MB/sec 2 clients 2 procs max_latency=0.123 ms Throughput 1596.64 MB/sec 5 clients 5 procs max_latency=0.119 ms Throughput 2985.47 MB/sec 10 clients 10 procs max_latency=0.262 ms Throughput 4521.15 MB/sec 20 clients 20 procs max_latency=0.506 ms Throughput 9438.1 MB/sec 40 clients 40 procs max_latency=2.052 ms Throughput 8210.5 MB/sec 80 clients 80 procs max_latency=8.310 ms WA_WEIGHT Throughput 697.292 MB/sec 2 clients 2 procs max_latency=0.127 ms Throughput 1596.48 MB/sec 5 clients 5 procs max_latency=0.080 ms Throughput 2975.22 MB/sec 10 clients 10 procs max_latency=0.254 ms Throughput 4575.14 MB/sec 20 clients 20 procs max_latency=0.502 ms Throughput 9468.65 MB/sec 40 clients 40 procs max_latency=2.069 ms Throughput 8631.73 MB/sec 80 clients 80 procs max_latency=8.605 ms (increase on the top end) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Rik van Riel Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 2 ++ 2 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28cabed85387..ed2ab474ec93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5363,6 +5363,10 @@ static int wake_wide(struct task_struct *p) * * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or * will be) idle. + * + * wake_affine_weight() - considers the weight to reflect the average + * scheduling latency of the CPUs. This seems to work + * for the overloaded case. 
*/ static bool @@ -5378,6 +5382,40 @@ wake_affine_idle(struct sched_domain *sd, struct task_struct *p, return false; } +static bool +wake_affine_weight(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) +{ + s64 this_eff_load, prev_eff_load; + unsigned long task_load; + + this_eff_load = target_load(this_cpu, sd->wake_idx); + prev_eff_load = source_load(prev_cpu, sd->wake_idx); + + if (sync) { + unsigned long current_load = task_h_load(current); + + if (current_load > this_eff_load) + return true; + + this_eff_load -= current_load; + } + + task_load = task_h_load(p); + + this_eff_load += task_load; + if (sched_feat(WA_BIAS)) + this_eff_load *= 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load -= task_load; + if (sched_feat(WA_BIAS)) + prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); + + return this_eff_load <= prev_eff_load; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { @@ -5387,6 +5425,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_IDLE) && !affine) affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && !affine) + affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { schedstat_inc(sd->ttwu_move_affine); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 0a519f8c224d..319ed0e8a347 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -82,3 +82,5 @@ SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) -- cgit v1.2.3 From 024c9d2faebdad3fb43fe49ad68e91a36190f1e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Oct 2017 10:36:53 +0200 Subject: sched/core: Ensure load_balance() respects the active_mask While load_balance() masks the source CPUs against active_mask, it had a hole against the destination CPU. Ensure the destination CPU is also part of the 'domain-mask & active-mask' set. Reported-by: Levin, Alexander (Sasha Levin) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed2ab474ec93..d3f3094856fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8042,6 +8042,13 @@ static int should_we_balance(struct lb_env *env) struct sched_group *sg = env->sd->groups; int cpu, balance_cpu = -1; + /* + * Ensure the balancing environment is consistent; can happen + * when the softirq triggers 'during' hotplug. + */ + if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) + return 0; + /* * In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. -- cgit v1.2.3 From 62cb1188ed86a9cf082fd2f757d4dd9b54741f24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Aug 2017 15:07:54 +0200 Subject: sched/idle: Move quiet_vmstate() into the NOHZ code quiet_vmstat() is an expensive function that only makes sense when we go into NOHZ. 
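A minimal standalone sketch of the effect of the move, assuming a single flag for "tick stopped"; quiet_vmstat_model(), stop_tick() and do_idle_model() are illustrative names for the idle and NOHZ paths touched below, not the kernel functions.

/* Do the expensive flush only on the (rarer) transition into NOHZ,
 * not on every idle entry. */
#include <stdio.h>

static int flushes;

static void quiet_vmstat_model(void)           /* expensive per-CPU flush */
{
        flushes++;
}

static void stop_tick(int *tick_stopped)
{
        if (!*tick_stopped) {
                quiet_vmstat_model();           /* new home: NOHZ entry only */
                *tick_stopped = 1;
        }
}

static void do_idle_model(int *tick_stopped, int can_stop_tick)
{
        /* quiet_vmstat_model() used to be called unconditionally here. */
        if (can_stop_tick)
                stop_tick(tick_stopped);
}

int main(void)
{
        int tick_stopped = 0;

        for (int i = 0; i < 100; i++)
                do_idle_model(&tick_stopped, i == 42);  /* tick stops once */
        printf("flushes: %d\n", flushes);               /* 1, not 100 */
        return 0;
}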
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: aubrey.li@linux.intel.com Cc: cl@linux.com Cc: fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 1 - kernel/time/tick-sched.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 257f4f0b4532..b2e8f0afbb42 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,7 +219,6 @@ static void do_idle(void) */ __current_set_polling(); - quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c7a899c5ce64..7b258c59d78a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -787,6 +788,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { calc_load_nohz_start(); cpu_load_update_nohz_start(); + quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; -- cgit v1.2.3 From 1d48b080bcce0a5e7d7aa2dbcdb35deefc188c3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Sep 2017 13:50:16 +0200 Subject: sched/debug: Rename task-state printing helpers Steve requested better names for the new task-state helper functions. So introduce the concept of task-state index for the printing and rename __get_task_state() to task_state_index() and __task_state_to_char() to task_index_to_char(). Requested-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929115016.pzlqc7ss3ccystyg@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 12 ++++++------ kernel/trace/trace_sched_wakeup.c | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c738e764e2a5..90db994ac900 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = __task_state_to_char(field->next_state); - S = __task_state_to_char(field->prev_state); + T = task_index_to_char(field->next_state); + S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0c331978b1a6..500f370d3bb1 100644 --- 
a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = __get_task_state(prev); + entry->prev_state = task_state_index(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = __get_task_state(next); + entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = __get_task_state(curr); + entry->prev_state = task_state_index(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = __get_task_state(wakee); + entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit v1.2.3 From e964d3501b64d6930aaa4dd18955a8cd086ccb92 Mon Sep 17 00:00:00 2001 From: luca abeni Date: Thu, 7 Sep 2017 12:09:28 +0200 Subject: sched/headers: Remove duplicate prototype of __dl_clear_params() Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-2-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e83d1b8be611..9d5aa18fc9bf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy, extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); -extern void __dl_clear_params(struct task_struct *p); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); -- cgit v1.2.3 From 295d6d5e373607729bcc8182c25afe964655714f Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 7 Sep 2017 12:09:29 +0200 Subject: sched/deadline: Fix switching to -deadline Fix a bug introduced in: 72f9f3fdc928 ("sched/deadline: Remove dl_new from struct sched_dl_entity") After that commit, when switching to -deadline if the scheduling deadline of a task is in the past then switched_to_dl() calls setup_new_entity() to properly initialize the scheduling deadline and runtime. The problem is that the task is enqueued _before_ having its parameters initialized by setup_new_entity(), and this can cause problems. For example, a task with its out-of-date deadline in the past will potentially be enqueued as the highest priority one; however, its adjusted deadline may not be the earliest one. This patch fixes the problem by initializing the task's parameters before enqueuing it. 
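To make the ordering problem concrete, here is a toy userspace sketch (not the kernel code; "toy_dl_entity" and all numbers are made up) of what goes wrong when a stale absolute deadline is used for the priority decision before setup_new_dl_entity() has refreshed it:

#include <stdio.h>

/* Toy stand-in, not sched_dl_entity. */
struct toy_dl_entity { long long deadline; };

int main(void)
{
        long long now = 1000, relative_deadline = 100;
        struct toy_dl_entity queued = { .deadline = 500 };  /* already enqueued */
        struct toy_dl_entity waking = { .deadline = 10 };   /* stale, in the past */

        /* Buggy order: decide priority first, using the stale deadline ... */
        int looks_earliest = waking.deadline < queued.deadline;        /* 1: wrongly "wins" */

        /* ... then refresh it, as setup_new_dl_entity() would have done. */
        waking.deadline = now + relative_deadline;      /* 1100: actually later than 500 */

        printf("treated as earliest: %d, refreshed deadline: %lld vs queued: %lld\n",
               looks_earliest, waking.deadline, queued.deadline);
        return 0;
}

The hunk below applies the fix at the right spot: on ENQUEUE_RESTORE, refresh an entity whose deadline is already in the past before __enqueue_dl_entity() inserts it.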
Signed-off-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-3-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..9d45354e4296 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1364,6 +1364,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, + rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2255,13 +2259,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* - * If p is boosted we already updated its params in - * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), - * p's deadline being now already after rq_clock(rq). - */ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl); if (rq->curr != p) { #ifdef CONFIG_SMP -- cgit v1.2.3 From 8c0944cee7af55291df0b28e6e2eeac0930e93c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 12:09:30 +0200 Subject: sched/deadline: Rename __dl_clear() to __dl_sub() __dl_sub() is more meaningful as a name, and is more consistent with the naming of the dual function (__dl_add()). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504778971-13573-4-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 +++++----- kernel/sched/sched.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d45354e4296..8d1b946fa684 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -242,7 +242,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -1209,7 +1209,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -2170,7 +2170,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. 
*/ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -2448,7 +2448,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && @@ -2460,7 +2460,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. */ - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9d5aa18fc9bf..a81c9782e98c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -226,7 +226,7 @@ struct dl_bw { static inline void __dl_update(struct dl_bw *dl_b, s64 bw); static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; __dl_update(dl_b, (s32)tsk_bw / cpus); -- cgit v1.2.3 From ed4ad1ca08a53cf1a805478678d1e7ff0d2cf251 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Oct 2017 14:50:33 +0200 Subject: sched/topology: Restore SD_PREFER_SIBLING on MC domains The normal x86_topology on NHM+ machines degenerates because the MC and CPU domains are of the same size, therefore MC inherits SD_PREFER_SIBLING from CPU (which then gets taken out). The result is that we'll spread tasks across the first NUMA level in order to maximize cache utilization. However, for the x86_numa_in_package_topology we loose the CPU domain, and we'll not have SD_PREFER_SIBLING set anywhere, giving a distinct difference in behaviour. Commit: 8e7fbcbc22c1 ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs") made a fail by not preserving the SD_PREFER_SIBLING for the !power_saving case on both CPU and MC. Then commit: 6956dc568f34 ("sched/numa: Add SD_PERFER_SIBLING to CPU domain") adds it back to the CPU but not MC. Restore that now, such that we get consistent spreading behaviour wrt L3 and NUMA. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f1cf4f306a82..86e81f06d36b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1157,6 +1157,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; -- cgit v1.2.3 From 051f3ca02e46432c0965e8948f00c07d8a2f09c0 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 7 Sep 2017 02:20:05 -0500 Subject: sched/topology: Introduce NUMA identity node sched domain On AMD Family17h-based (EPYC) system, a logical NUMA node can contain upto 8 cores (16 threads) with the following topology. 
---------------------------- C0 | T0 T1 | || | T0 T1 | C4 --------| || |-------- C1 | T0 T1 | L3 || L3 | T0 T1 | C5 --------| || |-------- C2 | T0 T1 | #0 || #1 | T0 T1 | C6 --------| || |-------- C3 | T0 T1 | || | T0 T1 | C7 ---------------------------- Here, there are 2 last-level (L3) caches per logical NUMA node. A socket can contain upto 4 NUMA nodes, and a system can support upto 2 sockets. With full system configuration, current scheduler creates 4 sched domains: domain0 SMT (span a core) domain1 MC (span a last-level-cache) domain2 NUMA (span a socket: 4 nodes) domain3 NUMA (span a system: 8 nodes) Note that there is no domain to represent cpus spaning a logical NUMA node. With this hierarchy of sched domains, the scheduler does not balance properly in the following cases: Case1: When running 8 tasks, a properly balanced system should schedule a task per logical NUMA node. This is not the case for the current scheduler. Case2: In some cases, threads are scheduled on the same cpu, while other cpus are idle. This results in run-to-run inconsistency. For example: taskset -c 0-7 sysbench --num-threads=8 --test=cpu \ --cpu-max-prime=100000 run Total execution time ranges from 25.1s to 33.5s depending on threads placement, where 25.1s is when all 8 threads are balanced properly on 8 cpus. Introducing NUMA identity node sched domain, which is based on how SRAT/SLIT table define a logical NUMA node. This results in the following hierarchy of sched domains on the same system described above. domain0 SMT (span a core) domain1 MC (span a last-level-cache) domain2 NODE (span a logical NUMA node) domain3 NUMA (span a socket: 4 nodes) domain4 NUMA (span a system: 8 nodes) This fixes the improper load balancing cases mentioned above. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Link: http://lkml.kernel.org/r/1504768805-46716-1-git-send-email-suravee.suthikulpanit@amd.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 86e81f06d36b..f51d123f9fe1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1332,6 +1332,10 @@ void sched_init_numa(void) if (!sched_domains_numa_distance) return; + /* Includes NUMA identity node at level 0. */ + sched_domains_numa_distance[level++] = curr_distance; + sched_domains_numa_levels = level; + /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. @@ -1379,8 +1383,7 @@ void sched_init_numa(void) return; /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). + * 'level' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1441,10 +1444,19 @@ void sched_init_numa(void) for (i = 0; sched_domain_topology[i].mask; i++) tl[i] = sched_domain_topology[i]; + /* + * Add the NUMA identity distance, aka single NODE. + */ + tl[i++] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .numa_level = 0, + SD_INIT_NAME(NODE) + }; + /* * .. and append 'j' levels of NUMA goodness. 
*/ - for (j = 0; j < level; i++, j++) { + for (j = 1; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, -- cgit v1.2.3 From 93824900a2e242766f5fe6ae7697e3d7171aa234 Mon Sep 17 00:00:00 2001 From: Uladzislau Rezki Date: Wed, 13 Sep 2017 12:24:30 +0200 Subject: sched/fair: Search a task from the tail of the queue As a first step this patch makes cfs_tasks list as MRU one. It means, that when a next task is picked to run on physical CPU it is moved to the front of the list. Therefore, the cfs_tasks list is more or less sorted (except woken tasks) starting from recently given CPU time tasks toward tasks with max wait time in a run-queue, i.e. MRU list. Second, as part of the load balance operation, this approach starts detach_tasks()/detach_one_task() from the tail of the queue instead of the head, giving some advantages: - tends to pick a task with highest wait time; - tasks located in the tail are less likely cache-hot, therefore the can_migrate_task() decision is higher. hackbench illustrates slightly better performance. For example doing 1000 samples and 40 groups on i5-3320M CPU, it shows below figures: default: 0.657 avg patched: 0.646 avg Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Mike Galbraith Cc: Nicolas Pitre Cc: Oleg Nesterov Cc: Oleksiy Avramchenko Cc: Paul Turner Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20170913102430.8985-2-urezki@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac6602c5f36f..cc0bfb035df9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6628,10 +6628,7 @@ again: set_next_entity(cfs_rq, se); } - if (hrtick_enabled(rq)) - hrtick_start_fair(rq, p); - - return p; + goto done; simple: #endif @@ -6645,6 +6642,16 @@ simple: p = task_of(se); +done: __maybe_unused +#ifdef CONFIG_SMP + /* + * Move the next running task to the front of + * the list, so our cfs_tasks list becomes MRU + * one. 
+ */ + list_move(&p->se.group_node, &rq->cfs_tasks); +#endif + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); @@ -7080,11 +7087,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) */ static struct task_struct *detach_one_task(struct lb_env *env) { - struct task_struct *p, *n; + struct task_struct *p; lockdep_assert_held(&env->src_rq->lock); - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + list_for_each_entry_reverse(p, + &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; @@ -7130,7 +7138,7 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_first_entry(tasks, struct task_struct, se.group_node); + p = list_last_entry(tasks, struct task_struct, se.group_node); env->loop++; /* We've more or less seen every task there is, call it quits */ @@ -7180,7 +7188,7 @@ static int detach_tasks(struct lb_env *env) continue; next: - list_move_tail(&p->se.group_node, tasks); + list_move(&p->se.group_node, tasks); } /* -- cgit v1.2.3 From ea16f0ea6c3dc9e1aa083bd3d1792ba02860526e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 11:55:51 +0100 Subject: sched/fair: Sync task util before slow-path wakeup We use task_util() in find_idlest_group() via capacity_spare_wake(). This task_util() updated in wake_cap(). However wake_cap() is not the only reason for ending up in find_idlest_group() - we could have been sent there by wake_wide(). So explicitly sync the task util with prev_cpu when we are about to head to find_idlest_group(). We could simply do this at the beginning of select_task_rq_fair() (i.e. irrespective of whether we're heading to select_idle_sibling() or find_idlest_group() & co), but I didn't want to slow down the select_idle_sibling() path more than necessary. Don't do this during fork balancing, we won't need the task_util and we'd just clobber the last_update_time, which is supposed to be 0. Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Andres Oportus Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20170808095519.10077-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cc0bfb035df9..c04a42556a59 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6263,8 +6263,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { - pick_cpu: +pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); -- cgit v1.2.3 From 583ffd99d7657755736d831bbc182612d1d2697d Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 11:58:54 +0100 Subject: sched/fair: Force balancing on NOHZ balance if local group has capacity The "goto force_balance" here is intended to mitigate the fact that avg_load calculations can result in bad placement decisions when priority is asymmetrical. 
The original commit that adds it: fab476228ba3 ("sched: Force balancing on newidle balance if local group has capacity") explains: Under certain situations, such as a niced down task (i.e. nice = -15) in the presence of nr_cpus NICE0 tasks, the niced task lands on a sched group and kicks away other tasks because of its large weight. This leads to sub-optimal utilization of the machine. Even though the sched group has capacity, it does not pull tasks because sds.this_load >> sds.max_load, and f_b_g() returns NULL. A similar but inverted issue also affects ARM big.LITTLE (asymmetrical CPU capacity) systems - consider 8 always-running, same-priority tasks on a system with 4 "big" and 4 "little" CPUs. Suppose that 5 of them end up on the "big" CPUs (which will be represented by one sched_group in the DIE sched_domain) and 3 on the "little" (the other sched_group in DIE), leaving one CPU unused. Because the "big" group has a higher group_capacity its avg_load may not present an imbalance that would cause migrating a task to the idle "little". The force_balance case here solves the problem but currently only for CPU_NEWLY_IDLE balances, which in theory might never happen on the unused CPU. Including CPU_IDLE in the force_balance case means there's an upper bound on the time before we can attempt to solve the underutilization: after DIE's sd->balance_interval has passed the next nohz balance kick will help us out. Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170807163900.25180-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c04a42556a59..cf3e816db80f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8186,8 +8186,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; -- cgit v1.2.3 From 18bd1b4bd53aba81d76d55e91a68310a227dc187 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:12 +0100 Subject: sched/fair: Move select_task_rq_fair() slow-path into its own function In preparation for changes that would otherwise require adding a new level of indentation to the while(sd) loop, create a new function find_idlest_cpu() which contains this loop, and rename the existing find_idlest_cpu() to find_idlest_group_cpu(). Code inside the while(sd) loop is unchanged. @new_cpu is added as a variable in the new function, with the same initial value as the @new_cpu in select_task_rq_fair(). 
Suggested-by: Peter Zijlstra Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-2-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 83 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf3e816db80f..dd4f25305604 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5859,10 +5859,10 @@ skip_spare: } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5911,6 +5911,50 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return new_cpu; +} + #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -6277,39 +6321,8 @@ pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ + } else { + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); -- cgit v1.2.3 From e90381eaecf6d59c60fe396838e0e99789531419 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:13 +0100 Subject: sched/fair: Remove unnecessary comparison with -1 Since commit: 83a0a96a5f26 ("sched/fair: Leverage the idle state info when choosing the "idlest" cpu") find_idlest_group_cpu() (formerly find_idlest_cpu) no longer returns -1, so we can simplify the checking of the return value in find_idlest_cpu(). Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-3-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd4f25305604..bdd28fbad7a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5933,7 +5933,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p } new_cpu = find_idlest_group_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { + if (new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; continue; -- cgit v1.2.3 From 0d10ab952e99f3e9f374898e93f45452b81e5711 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:14 +0100 Subject: sched/fair: Fix find_idlest_group() when local group is not allowed When the local group is not allowed we do not modify this_*_load from their initial value of 0. That means that the load checks at the end of find_idlest_group cause us to incorrectly return NULL. Fixing the initial values to ULONG_MAX means we will instead return the idlest remote group in that case. 
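A minimal userspace sketch of the failure mode (heavily simplified; the real comparison in find_idlest_group() also applies an imbalance margin): with the local group's load left at 0, the final "is the local group good enough?" check always says yes, whereas seeding it with ULONG_MAX lets the remote idlest group win, as intended:

#include <stdio.h>
#include <limits.h>

/* Simplified stand-in for the end-of-function comparison. */
static const char *pick(unsigned long this_runnable_load, unsigned long min_runnable_load)
{
        if (min_runnable_load >= this_runnable_load)
                return "local group (return NULL)";
        return "remote idlest group";
}

int main(void)
{
        unsigned long remote = 123;     /* some real load on the idlest remote group */

        printf("this_runnable_load = 0         -> %s\n", pick(0, remote));
        printf("this_runnable_load = ULONG_MAX -> %s\n", pick(ULONG_MAX, remote));
        return 0;
}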
Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-4-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bdd28fbad7a3..cca1835efa7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5737,8 +5737,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long this_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; -- cgit v1.2.3 From 6fee85ccbc76e8aeba43dc120c5fa3c5409a4e2c Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:15 +0100 Subject: sched/fair: Fix usage of find_idlest_group() when no groups are allowed When 'p' is not allowed on any of the CPUs in the sched_domain, we currently return NULL from find_idlest_group(), and pointlessly continue the search on lower sched_domain levels (where 'p' is also not allowed) before returning prev_cpu regardless (as we have not updated new_cpu). Add an explicit check for this case, and add a comment to find_idlest_group(). Now when find_idlest_group() returns NULL, it always means that the local group is allowed and idlest. Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-5-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cca1835efa7b..ed80d6bd76c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5730,6 +5730,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -5917,6 +5919,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = prev_cpu; + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + while (sd) { struct sched_group *group; struct sched_domain *tmp; -- cgit v1.2.3 From 93f50f90247e3e926bbe9830df089c64a5cec236 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 5 Oct 2017 12:45:16 +0100 Subject: sched/fair: Fix usage of find_idlest_group() when the local group is idlest find_idlest_group() returns NULL when the local group is idlest. The caller then continues the find_idlest_group() search at a lower level of the current CPU's sched_domain hierarchy. 
find_idlest_group_cpu() is not consulted and, crucially, @new_cpu is not updated. This means the search is pointless and we return @prev_cpu from select_task_rq_fair(). This is fixed by initialising @new_cpu to @cpu instead of @prev_cpu. Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-6-brendan.jackman@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed80d6bd76c8..56f343b8e749 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5917,7 +5917,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { - int new_cpu = prev_cpu; + int new_cpu = cpu; if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) return prev_cpu; -- cgit v1.2.3 From 4bdced5c9a2922521e325896a7bbbf0132c94e56 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Oct 2017 14:05:04 -0400 Subject: sched/rt: Simplify the IPI based RT balancing logic When a CPU lowers its priority (schedules out a high priority task for a lower priority one), a check is made to see if any other CPU has overloaded RT tasks (more than one). It checks the rto_mask to determine this and if so it will request to pull one of those tasks to itself if the non running RT task is of higher priority than the new priority of the next task to run on the current CPU. When we deal with large number of CPUs, the original pull logic suffered from large lock contention on a single CPU run queue, which caused a huge latency across all CPUs. This was caused by only having one CPU having overloaded RT tasks and a bunch of other CPUs lowering their priority. To solve this issue, commit: b6366f048e0c ("sched/rt: Use IPI to trigger RT task push migration instead of pulling") changed the way to request a pull. Instead of grabbing the lock of the overloaded CPU's runqueue, it simply sent an IPI to that CPU to do the work. Although the IPI logic worked very well in removing the large latency build up, it still could suffer from a large number of IPIs being sent to a single CPU. On a 80 CPU box, I measured over 200us of processing IPIs. Worse yet, when I tested this on a 120 CPU box, with a stress test that had lots of RT tasks scheduling on all CPUs, it actually triggered the hard lockup detector! One CPU had so many IPIs sent to it, and due to the restart mechanism that is triggered when the source run queue has a priority status change, the CPU spent minutes! processing the IPIs. Thinking about this further, I realized there's no reason for each run queue to send its own IPI. As all CPUs with overloaded tasks must be scanned regardless if there's one or many CPUs lowering their priority, because there's no current way to find the CPU with the highest priority task that can schedule to one of these CPUs, there really only needs to be one IPI being sent around at a time. This greatly simplifies the code! The new approach is to have each root domain have its own irq work, as the rto_mask is per root domain. 
The root domain has the following fields attached to it: rto_push_work - the irq work to process each CPU set in rto_mask rto_lock - the lock to protect some of the other rto fields rto_loop_start - an atomic that keeps contention down on rto_lock the first CPU scheduling in a lower priority task is the one to kick off the process. rto_loop_next - an atomic that gets incremented for each CPU that schedules in a lower priority task. rto_loop - a variable protected by rto_lock that is used to compare against rto_loop_next rto_cpu - The cpu to send the next IPI to, also protected by the rto_lock. When a CPU schedules in a lower priority task and wants to make sure overloaded CPUs know about it. It increments the rto_loop_next. Then it atomically sets rto_loop_start with a cmpxchg. If the old value is not "0", then it is done, as another CPU is kicking off the IPI loop. If the old value is "0", then it will take the rto_lock to synchronize with a possible IPI being sent around to the overloaded CPUs. If rto_cpu is greater than or equal to nr_cpu_ids, then there's either no IPI being sent around, or one is about to finish. Then rto_cpu is set to the first CPU in rto_mask and an IPI is sent to that CPU. If there's no CPUs set in rto_mask, then there's nothing to be done. When the CPU receives the IPI, it will first try to push any RT tasks that is queued on the CPU but can't run because a higher priority RT task is currently running on that CPU. Then it takes the rto_lock and looks for the next CPU in the rto_mask. If it finds one, it simply sends an IPI to that CPU and the process continues. If there's no more CPUs in the rto_mask, then rto_loop is compared with rto_loop_next. If they match, everything is done and the process is over. If they do not match, then a CPU scheduled in a lower priority task as the IPI was being passed around, and the process needs to start again. The first CPU in rto_mask is sent the IPI. This change removes this duplication of work in the IPI logic, and greatly lowers the latency caused by the IPIs. This removed the lockup happening on the 120 CPU machine. It also simplifies the code tremendously. What else could anyone ask for? Thanks to Peter Zijlstra for simplifying the rto_loop_start atomic logic and supplying me with the rto_start_trylock() and rto_start_unlock() helper functions. 
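The rto_loop_start/rto_loop_next handshake described above can be sketched with plain C11 atomics. This is a userspace toy, not the kernel implementation; the real fields live in struct root_domain and use atomic_cmpxchg_acquire()/atomic_set_release(), as the diff below shows:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int rto_loop_start;
static atomic_int rto_loop_next;

static int rto_start_trylock(atomic_int *v)
{
        int expected = 0;

        return atomic_compare_exchange_strong_explicit(v, &expected, 1,
                        memory_order_acquire, memory_order_relaxed);
}

static void rto_start_unlock(atomic_int *v)
{
        atomic_store_explicit(v, 0, memory_order_release);
}

static void cpu_lowers_prio(int cpu)
{
        /* Always note that a rescan of the overloaded CPUs is needed ... */
        atomic_fetch_add(&rto_loop_next, 1);

        /* ... but only the first CPU to flip 0 -> 1 drives the IPI loop. */
        if (!rto_start_trylock(&rto_loop_start))
                return;

        printf("CPU%d kicks the irq_work at the first CPU in rto_mask\n", cpu);

        rto_start_unlock(&rto_loop_start);
}

int main(void)
{
        cpu_lowers_prio(3);
        cpu_lowers_prio(5);     /* under real concurrency this one usually just bumps the counter */
        return 0;
}

Losing the trylock race is harmless: the loser has already incremented rto_loop_next, which forces the running iterator to rescan the rto_mask before it finishes.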
Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Clark Williams Cc: Daniel Bristot de Oliveira Cc: John Kacur Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott Wood Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170424114732.1aac6dc4@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 316 ++++++++++++++++++------------------------------ kernel/sched/sched.h | 24 ++-- kernel/sched/topology.c | 6 + 3 files changed, 138 insertions(+), 208 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..fda27991699a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -73,10 +73,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) -static void push_irq_work_func(struct irq_work *work); -#endif - void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -96,13 +92,6 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); - -#ifdef HAVE_RT_PUSH_IPI - rt_rq->push_flags = 0; - rt_rq->push_cpu = nr_cpu_ids; - raw_spin_lock_init(&rt_rq->push_lock); - init_irq_work(&rt_rq->push_work, push_irq_work_func); -#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1875,241 +1864,166 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI + /* - * The search for the next cpu always starts at rq->cpu and ends - * when we reach rq->cpu again. It will never return rq->cpu. - * This returns the next cpu to check, or nr_cpu_ids if the loop - * is complete. + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * All CPUs with overloaded RT tasks need to be notified as there is currently + * no way to know which of these CPUs have the highest priority task waiting + * to run. Instead of trying to take a spinlock on each of these CPUs, + * which has shown to cause large latency when done on machines with many + * CPUs, sending an IPI to the CPUs to have them push off the overloaded + * RT tasks waiting to run. + * + * Just sending an IPI to each of the CPUs is also an issue, as on large + * count CPU machines, this can cause an IPI storm on a CPU, especially + * if its the only CPU with multiple RT tasks queued, and a large number + * of CPUs scheduling a lower priority task at the same time. + * + * Each root domain has its own irq work function that can iterate over + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT + * tassk must be checked if there's one or many CPUs that are lowering + * their priority, there's a single irq work iterator that will try to + * push off RT tasks that are waiting to run. + * + * When a CPU schedules a lower priority task, it will kick off the + * irq work iterator that will jump to each CPU with overloaded RT tasks. 
+ * As it only takes the first CPU that schedules a lower priority task + * to start the process, the rto_start variable is incremented and if + * the atomic result is one, then that CPU will try to take the rto_lock. + * This prevents high contention on the lock as the process handles all + * CPUs scheduling lower priority tasks. + * + * All CPUs that are scheduling a lower priority task will increment the + * rt_loop_next variable. This will make sure that the irq work iterator + * checks all RT overloaded CPUs whenever a CPU schedules a new lower + * priority task, even if the iterator is in the middle of a scan. Incrementing + * the rt_loop_next will cause the iterator to perform another scan. * - * rq->rt.push_cpu holds the last cpu returned by this function, - * or if this is the first instance, it must hold rq->cpu. */ static int rto_next_cpu(struct rq *rq) { - int prev_cpu = rq->rt.push_cpu; + struct root_domain *rd = rq->rd; + int next; int cpu; - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); - /* - * If the previous cpu is less than the rq's CPU, then it already - * passed the end of the mask, and has started from the beginning. - * We end if the next CPU is greater or equal to rq's CPU. + * When starting the IPI RT pushing, the rto_cpu is set to -1, + * rt_next_cpu() will simply return the first CPU found in + * the rto_mask. + * + * If rto_next_cpu() is called with rto_cpu is a valid cpu, it + * will return the next CPU found in the rto_mask. + * + * If there are no more CPUs left in the rto_mask, then a check is made + * against rto_loop and rto_loop_next. rto_loop is only updated with + * the rto_lock held, but any CPU may increment the rto_loop_next + * without any locking. */ - if (prev_cpu < rq->cpu) { - if (cpu >= rq->cpu) - return nr_cpu_ids; + for (;;) { - } else if (cpu >= nr_cpu_ids) { - /* - * We passed the end of the mask, start at the beginning. - * If the result is greater or equal to the rq's CPU, then - * the loop is finished. - */ - cpu = cpumask_first(rq->rd->rto_mask); - if (cpu >= rq->cpu) - return nr_cpu_ids; - } - rq->rt.push_cpu = cpu; + /* When rto_cpu is -1 this acts like cpumask_first() */ + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); - /* Return cpu to let the caller know if the loop is finished or not */ - return cpu; -} + rd->rto_cpu = cpu; -static int find_next_push_cpu(struct rq *rq) -{ - struct rq *next_rq; - int cpu; + if (cpu < nr_cpu_ids) + return cpu; - while (1) { - cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - break; - next_rq = cpu_rq(cpu); + rd->rto_cpu = -1; + + /* + * ACQUIRE ensures we see the @rto_mask changes + * made prior to the @next value observed. + * + * Matches WMB in rt_set_overload(). + */ + next = atomic_read_acquire(&rd->rto_loop_next); - /* Make sure the next rq can push to this rq */ - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + if (rd->rto_loop == next) break; + + rd->rto_loop = next; } - return cpu; + return -1; } -#define RT_PUSH_IPI_EXECUTING 1 -#define RT_PUSH_IPI_RESTART 2 +static inline bool rto_start_trylock(atomic_t *v) +{ + return !atomic_cmpxchg_acquire(v, 0, 1); +} -/* - * When a high priority task schedules out from a CPU and a lower priority - * task is scheduled in, a check is made to see if there's any RT tasks - * on other CPUs that are waiting to run because a higher priority RT task - * is currently running on its CPU. 
In this case, the CPU with multiple RT - * tasks queued on it (overloaded) needs to be notified that a CPU has opened - * up that may be able to run one of its non-running queued RT tasks. - * - * On large CPU boxes, there's the case that several CPUs could schedule - * a lower priority task at the same time, in which case it will look for - * any overloaded CPUs that it could pull a task from. To do this, the runqueue - * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting - * for a single overloaded CPU's runqueue lock can produce a large latency. - * (This has actually been observed on large boxes running cyclictest). - * Instead of taking the runqueue lock of the overloaded CPU, each of the - * CPUs that scheduled a lower priority task simply sends an IPI to the - * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with - * lots of contention. The overloaded CPU will look to push its non-running - * RT task off, and if it does, it can then ignore the other IPIs coming - * in, and just pass those IPIs off to any other overloaded CPU. - * - * When a CPU schedules a lower priority task, it only sends an IPI to - * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, - * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with - * RT overloaded tasks, would cause 100 IPIs to go out at once. - * - * The overloaded RT CPU, when receiving an IPI, will try to push off its - * overloaded RT tasks and then send an IPI to the next CPU that has - * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks - * have completed. Just because a CPU may have pushed off its own overloaded - * RT task does not mean it should stop sending the IPI around to other - * overloaded CPUs. There may be another RT task waiting to run on one of - * those CPUs that are of higher priority than the one that was just - * pushed. - * - * An optimization that could possibly be made is to make a CPU array similar - * to the cpupri array mask of all running RT tasks, but for the overloaded - * case, then the IPI could be sent to only the CPU with the highest priority - * RT task waiting, and that CPU could send off further IPIs to the CPU with - * the next highest waiting task. Since the overloaded case is much less likely - * to happen, the complexity of this implementation may not be worth it. - * Instead, just send an IPI around to all overloaded CPUs. - * - * The rq->rt.push_flags holds the status of the IPI that is going around. - * A run queue can only send out a single IPI at a time. The possible flags - * for rq->rt.push_flags are: - * - * (None or zero): No IPI is going around for the current rq - * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around - * RT_PUSH_IPI_RESTART: The priority of the running task for the rq - * has changed, and the IPI should restart - * circulating the overloaded CPUs again. - * - * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated - * before sending to the next CPU. - * - * Instead of having all CPUs that schedule a lower priority task send - * an IPI to the same "first" CPU in the RT overload mask, they send it - * to the next overloaded CPU after their own CPU. This helps distribute - * the work when there's more than one overloaded CPU and multiple CPUs - * scheduling in lower priority tasks. - * - * When a rq schedules a lower priority task than what was currently - * running, the next CPU with overloaded RT tasks is examined first. 
- * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower - * priority task, it will send an IPI first to CPU 5, then CPU 5 will - * send to CPU 1 if it is still overloaded. CPU 1 will clear the - * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. - * - * The first CPU to notice IPI_RESTART is set, will clear that flag and then - * send an IPI to the next overloaded CPU after the rq->cpu and not the next - * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 - * schedules a lower priority task, and the IPI_RESTART gets set while the - * handling is being done on CPU 5, it will clear the flag and send it back to - * CPU 4 instead of CPU 1. - * - * Note, the above logic can be disabled by turning off the sched_feature - * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be - * taken by the CPU requesting a pull and the waiting RT task will be pulled - * by that CPU. This may be fine for machines with few CPUs. - */ -static void tell_cpu_to_push(struct rq *rq) +static inline void rto_start_unlock(atomic_t *v) { - int cpu; + atomic_set_release(v, 0); +} - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - raw_spin_lock(&rq->rt.push_lock); - /* Make sure it's still executing */ - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - /* - * Tell the IPI to restart the loop as things have - * changed since it started. - */ - rq->rt.push_flags |= RT_PUSH_IPI_RESTART; - raw_spin_unlock(&rq->rt.push_lock); - return; - } - raw_spin_unlock(&rq->rt.push_lock); - } +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu = -1; - /* When here, there's no IPI going around */ + /* Keep the loop going if the IPI is currently active */ + atomic_inc(&rq->rd->rto_loop_next); - rq->rt.push_cpu = rq->cpu; - cpu = find_next_push_cpu(rq); - if (cpu >= nr_cpu_ids) + /* Only one CPU can initiate a loop at a time */ + if (!rto_start_trylock(&rq->rd->rto_loop_start)) return; - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + raw_spin_lock(&rq->rd->rto_lock); + + /* + * The rto_cpu is updated under the lock, if it has a valid cpu + * then the IPI is still running and will continue due to the + * update to loop_next, and nothing needs to be done here. + * Otherwise it is finishing up and an ipi needs to be sent. + */ + if (rq->rd->rto_cpu < 0) + cpu = rto_next_cpu(rq); - irq_work_queue_on(&rq->rt.push_work, cpu); + raw_spin_unlock(&rq->rd->rto_lock); + + rto_start_unlock(&rq->rd->rto_loop_start); + + if (cpu >= 0) + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } /* Called from hardirq context */ -static void try_to_push_tasks(void *arg) +void rto_push_irq_work_func(struct irq_work *work) { - struct rt_rq *rt_rq = arg; - struct rq *rq, *src_rq; - int this_cpu; + struct rq *rq; int cpu; - this_cpu = rt_rq->push_cpu; + rq = this_rq(); - /* Paranoid check */ - BUG_ON(this_cpu != smp_processor_id()); - - rq = cpu_rq(this_cpu); - src_rq = rq_of_rt_rq(rt_rq); - -again: + /* + * We do not need to grab the lock to check for has_pushable_tasks. + * When it gets updated, a check is made if a push is possible. + */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_task(rq); + push_rt_tasks(rq); raw_spin_unlock(&rq->lock); } - /* Pass the IPI to the next rt overloaded queue */ - raw_spin_lock(&rt_rq->push_lock); - /* - * If the source queue changed since the IPI went out, - * we need to restart the search from that CPU again. 
- */ - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; - rt_rq->push_cpu = src_rq->cpu; - } + raw_spin_lock(&rq->rd->rto_lock); - cpu = find_next_push_cpu(src_rq); + /* Pass the IPI to the next rt overloaded queue */ + cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; - raw_spin_unlock(&rt_rq->push_lock); + raw_spin_unlock(&rq->rd->rto_lock); - if (cpu >= nr_cpu_ids) + if (cpu < 0) return; - /* - * It is possible that a restart caused this CPU to be - * chosen again. Don't bother with an IPI, just see if we - * have more to push. - */ - if (unlikely(cpu == rq->cpu)) - goto again; - /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rt_rq->push_work, cpu); -} - -static void push_irq_work_func(struct irq_work *work) -{ - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); - - try_to_push_tasks(rt_rq); + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a81c9782e98c..8aa24b41f652 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -505,7 +505,7 @@ static inline int rt_bandwidth_enabled(void) } /* RT IPI pull logic requires IRQ_WORK */ -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) # define HAVE_RT_PUSH_IPI #endif @@ -527,12 +527,6 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; -#ifdef HAVE_RT_PUSH_IPI - int push_flags; - int push_cpu; - struct irq_work push_work; - raw_spinlock_t push_lock; -#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -641,6 +635,19 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -658,6 +665,9 @@ extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif #endif /* CONFIG_SMP */ /* diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f51d123f9fe1..e50450c2fed8 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -268,6 +268,12 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; +#ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); +#endif + init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; -- cgit v1.2.3 From 76f8507f7a6442215df19de74f07eabca2462f1e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 29 Sep 2017 19:06:38 +0300 Subject: locking/rwsem: Add down_read_killable() Similar to down_read() and down_write_killable(), add killable version of down_read(), based on __down_read_killable() function, added in previous patches. 
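A short illustrative caller (kernel context, not part of the patch, so only a sketch): unlike down_read(), the killable variant can fail, so its return value must be checked:

/* Illustrative caller only. */
static int example_read_op(struct rw_semaphore *sem)
{
        int ret;

        ret = down_read_killable(sem);
        if (ret)                /* -EINTR: a fatal signal is pending */
                return ret;

        /* ... read-side critical section ... */

        up_read(sem);
        return 0;
}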
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rientjes@google.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Cc: viro@zeniv.linux.org.uk Link: http://lkml.kernel.org/r/150670119884.23930.2585570605960763239.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 4d48b1c4870d..e53f7746d9fd 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,6 +28,22 @@ void __sched down_read(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read); +int __sched down_read_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_reader_owned(sem); + return 0; +} + +EXPORT_SYMBOL(down_read_killable); + /* * trylock for reading -- returns 1 if successful, 0 if contention */ -- cgit v1.2.3 From a8a217c22116eff6c120d753c9934089fb229af0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 3 Oct 2017 19:25:27 +0100 Subject: locking/core: Remove {read,spin,write}_can_lock() Outside of the locking code itself, {read,spin,write}_can_lock() have no users in tree. Apparmor (the last remaining user of write_can_lock()) got moved over to lockdep by the previous patch. This patch removes the use of {read,spin,write}_can_lock() from the BUILD_LOCK_OPS macro, deferring to the trylock operation for testing the lock status, and subsequently removes the unused macros altogether. They aren't guaranteed to work in a concurrent environment and can give incorrect results in the case of qrwlock. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1507055129-12300-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..8fd48b5552a7 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -32,8 +32,6 @@ * include/linux/spinlock_api_smp.h */ #else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) /* * Some architectures can relax in favour of the CPU owning the lock. 
@@ -68,7 +66,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ @@ -88,7 +86,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ -- cgit v1.2.3 From de8cd83e91bc3ee212b3e6ec6e4283af9e4ab269 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Mon, 2 Oct 2017 20:21:39 -0400 Subject: audit: Record fanotify access control decisions The fanotify interface allows user space daemons to make access control decisions. Under common criteria requirements, we need to optionally record decisions based on policy. This patch adds a bit mask, FAN_AUDIT, that a user space daemon can 'or' into the response decision which will tell the kernel that it made a decision and record it. It would be used something like this in user space code: response.response = FAN_DENY | FAN_AUDIT; write(fd, &response, sizeof(struct fanotify_response)); When the syscall ends, the audit system will record the decision as a AUDIT_FANOTIFY auxiliary record to denote that the reason this event occurred is the result of an access control decision from fanotify rather than DAC or MAC policy. A sample event looks like this: type=PATH msg=audit(1504310584.332:290): item=0 name="./evil-ls" inode=1319561 dev=fc:03 mode=0100755 ouid=1000 ogid=1000 rdev=00:00 obj=unconfined_u:object_r:user_home_t:s0 nametype=NORMAL type=CWD msg=audit(1504310584.332:290): cwd="/home/sgrubb" type=SYSCALL msg=audit(1504310584.332:290): arch=c000003e syscall=2 success=no exit=-1 a0=32cb3fca90 a1=0 a2=43 a3=8 items=1 ppid=901 pid=959 auid=1000 uid=1000 gid=1000 euid=1000 suid=1000 fsuid=1000 egid=1000 sgid=1000 fsgid=1000 tty=pts1 ses=3 comm="bash" exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t: s0-s0:c0.c1023 key=(null) type=FANOTIFY msg=audit(1504310584.332:290): resp=2 Prior to using the audit flag, the developer needs to call fanotify_init or'ing in FAN_ENABLE_AUDIT to ensure that the kernel supports auditing. The calling process must also have the CAP_AUDIT_WRITE capability. 
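Putting the pieces together, a deliberately simplified sketch of such a daemon might look as follows; the watched path and the blanket-deny policy are placeholders, the single-event read loop glosses over full event parsing, and the program has to run with CAP_SYS_ADMIN plus CAP_AUDIT_WRITE on a kernel that accepts FAN_ENABLE_AUDIT:

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/fanotify.h>
  #include <unistd.h>

  int main(void)
  {
          struct fanotify_event_metadata ev;
          struct fanotify_response response;
          int fd;

          /* Declare up front that responses may carry FAN_AUDIT. */
          fd = fanotify_init(FAN_CLASS_CONTENT | FAN_ENABLE_AUDIT, O_RDONLY);
          if (fd < 0) {
                  perror("fanotify_init");  /* e.g. kernel without FAN_ENABLE_AUDIT */
                  return 1;
          }

          /* Ask for open-permission events on a placeholder path. */
          if (fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN_PERM, AT_FDCWD,
                            "/tmp/watched") < 0) {
                  perror("fanotify_mark");
                  return 1;
          }

          /* Deny every access and have the kernel log an AUDIT_FANOTIFY record. */
          while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
                  response.fd = ev.fd;
                  response.response = FAN_DENY | FAN_AUDIT;
                  write(fd, &response, sizeof(response));
                  close(ev.fd);
          }
          return 0;
  }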
Signed-off-by: sgrubb Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/auditsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ecc23e25c9eb..9c723e978245 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2390,6 +2390,12 @@ void __audit_log_kern_module(char *name) context->type = AUDIT_KERN_MODULE; } +void __audit_fanotify(unsigned int response) +{ + audit_log(current->audit_context, GFP_KERNEL, + AUDIT_FANOTIFY, "resp=%u", response); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; -- cgit v1.2.3 From 692b48258dda7c302e777d7d5f4217244478f1f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Oct 2017 08:04:13 -0700 Subject: workqueue: replace pool->manager_arb mutex with a flag Josef reported a HARDIRQ-safe -> HARDIRQ-unsafe lock order detected by lockdep: [ 1270.472259] WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected [ 1270.472783] 4.14.0-rc1-xfstests-12888-g76833e8 #110 Not tainted [ 1270.473240] ----------------------------------------------------- [ 1270.473710] kworker/u5:2/5157 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: [ 1270.474239] (&(&lock->wait_lock)->rlock){+.+.}, at: [] __mutex_unlock_slowpath+0xa2/0x280 [ 1270.474994] [ 1270.474994] and this task is already holding: [ 1270.475440] (&pool->lock/1){-.-.}, at: [] worker_thread+0x366/0x3c0 [ 1270.476046] which would create a new lock dependency: [ 1270.476436] (&pool->lock/1){-.-.} -> (&(&lock->wait_lock)->rlock){+.+.} [ 1270.476949] [ 1270.476949] but this new dependency connects a HARDIRQ-irq-safe lock: [ 1270.477553] (&pool->lock/1){-.-.} ... [ 1270.488900] to a HARDIRQ-irq-unsafe lock: [ 1270.489327] (&(&lock->wait_lock)->rlock){+.+.} ... [ 1270.494735] Possible interrupt unsafe locking scenario: [ 1270.494735] [ 1270.495250] CPU0 CPU1 [ 1270.495600] ---- ---- [ 1270.495947] lock(&(&lock->wait_lock)->rlock); [ 1270.496295] local_irq_disable(); [ 1270.496753] lock(&pool->lock/1); [ 1270.497205] lock(&(&lock->wait_lock)->rlock); [ 1270.497744] [ 1270.497948] lock(&pool->lock/1); , which will cause a irq inversion deadlock if the above lock scenario happens. The root cause of this safe -> unsafe lock order is the mutex_unlock(pool->manager_arb) in manage_workers() with pool->lock held. Unlocking mutex while holding an irq spinlock was never safe and this problem has been around forever but it never got noticed because the only time the mutex is usually trylocked while holding irqlock making actual failures very unlikely and lockdep annotation missed the condition until the recent b9c16a0e1f73 ("locking/mutex: Fix lockdep_assert_held() fail"). Using mutex for pool->manager_arb has always been a bit of stretch. It primarily is an mechanism to arbitrate managership between workers which can easily be done with a pool flag. The only reason it became a mutex is that pool destruction path wants to exclude parallel managing operations. This patch replaces the mutex with a new pool flag POOL_MANAGER_ACTIVE and make the destruction path wait for the current manager on a wait queue. v2: Drop unnecessary flag clearing before pool destruction as suggested by Boqun. 
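In outline, the arbitration reduces to a flag that is only ever touched under the irq-safe pool lock plus a wait queue for the destruction path. The following is a simplified sketch of that pattern with placeholder names, not the actual workqueue structures:

  #include <linux/spinlock.h>
  #include <linux/wait.h>

  static DEFINE_SPINLOCK(example_lock);			/* stands in for pool->lock */
  static DECLARE_WAIT_QUEUE_HEAD(example_manager_wait);
  static bool example_manager_active;			/* protected by example_lock */

  /* Worker side: try to become the manager; caller holds example_lock. */
  static bool example_try_manage(void)
  {
          if (example_manager_active)
                  return false;
          example_manager_active = true;
          return true;
  }

  /* Worker side: done managing; caller holds example_lock. */
  static void example_done_managing(void)
  {
          example_manager_active = false;
          wake_up(&example_manager_wait);
  }

  /* Destruction side: wait out the current manager, then take over. */
  static void example_destroy(void)
  {
          spin_lock_irq(&example_lock);
          wait_event_lock_irq(example_manager_wait,
                              !example_manager_active, example_lock);
          example_manager_active = true;
          /* ... destroy workers under example_lock ... */
          spin_unlock_irq(&example_lock);
  }

Because nothing in this shape ever releases a mutex while the irq-safe spinlock is held, the HARDIRQ-safe -> HARDIRQ-unsafe ordering reported by lockdep cannot arise.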
Signed-off-by: Tejun Heo Reported-by: Josef Bacik Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Boqun Feng Cc: stable@vger.kernel.org --- kernel/workqueue.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..a2dccfe1acec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,7 @@ enum { * attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ + POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ /* worker flags */ @@ -165,7 +166,6 @@ struct worker_pool { /* L: hash of busy workers */ /* see manage_workers() for details on the two manager mutexes */ - struct mutex manager_arb; /* manager arbitration */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ @@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_arb); + bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1980,24 +1981,17 @@ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - /* - * Anyone who successfully grabs manager_arb wins the arbitration - * and becomes the manager. mutex_trylock() on pool->manager_arb - * failure while holding pool->lock reliably indicates that someone - * else is managing the pool and the worker which failed trylock - * can proceed to executing work items. This means that anyone - * grabbing manager_arb is responsible for actually performing - * manager duties. If manager_arb is grabbed and released without - * actual management, the pool may stall indefinitely. - */ - if (!mutex_trylock(&pool->manager_arb)) + if (pool->flags & POOL_MANAGER_ACTIVE) return false; + + pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; - mutex_unlock(&pool->manager_arb); + pool->flags &= ~POOL_MANAGER_ACTIVE; + wake_up(&wq_manager_wait); return true; } @@ -3248,7 +3242,6 @@ static int init_worker_pool(struct worker_pool *pool) setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); @@ -3318,13 +3311,15 @@ static void put_unbound_pool(struct worker_pool *pool) hash_del(&pool->hash_node); /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. + * Become the manager and destroy all workers. This prevents + * @pool's workers from blocking on attach_mutex. We're the last + * manager and @pool gets freed with the flag set. 
*/ - mutex_lock(&pool->manager_arb); - spin_lock_irq(&pool->lock); + wait_event_lock_irq(wq_manager_wait, + !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + pool->flags |= POOL_MANAGER_ACTIVE; + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); @@ -3338,8 +3333,6 @@ static void put_unbound_pool(struct worker_pool *pool) if (pool->detach_completion) wait_for_completion(pool->detach_completion); - mutex_unlock(&pool->manager_arb); - /* shut down the timers */ del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); -- cgit v1.2.3 From 084f5601c357e4ee59cf0712200d3f5c4710ba40 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 29 Sep 2017 14:26:48 +0100 Subject: seccomp: make function __get_seccomp_filter static The function __get_seccomp_filter is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol '__get_seccomp_filter' was not declared. Should it be static? Signed-off-by: Colin Ian King Fixes: 66a733ea6b61 ("seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter()") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index bb3a38005b9c..0ae832e13b97 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } -void __get_seccomp_filter(struct seccomp_filter *filter) +static void __get_seccomp_filter(struct seccomp_filter *filter) { /* Reference count is bounded by the number of total processes. */ refcount_inc(&filter->usage); -- cgit v1.2.3 From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:10 -0700 Subject: bpf: encapsulate verifier log state into a structure Put the loose log_* variables into a structure. This will make it simpler to remove the global verifier state in following patches. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6352a88ca6d1..e53458b02249 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,8 +156,7 @@ struct bpf_call_arg_meta { /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ -static u32 log_level, log_size, log_len; -static char *log_buf; +static struct bpf_verifer_log verifier_log; static DEFINE_MUTEX(bpf_verifier_lock); @@ -167,13 +166,15 @@ static DEFINE_MUTEX(bpf_verifier_lock); */ static __printf(1, 2) void verbose(const char *fmt, ...) { + struct bpf_verifer_log *log = &verifier_log; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + log->len_used += vscnprintf(log->kbuf + log->len_used, + log->len_total - log->len_used, fmt, args); va_end(args); } @@ -886,7 +887,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. 
*/ - if (log_level) + if (verifier_log.level) print_verifier_state(state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a @@ -2956,7 +2957,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } - if (log_level) + if (verifier_log.level) print_verifier_state(this_branch); return 0; } @@ -3712,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (verifier_log.level) { if (do_print_state) verbose("\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); @@ -3725,8 +3726,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) + if (verifier_log.level > 1 || + (verifier_log.level && do_print_state)) { + if (verifier_log.level > 1) verbose("%d:", insn_idx); else verbose("\nfrom %d to %d:", @@ -3735,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (log_level) { + if (verifier_log.level) { verbose("%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -4389,7 +4391,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; + struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; int ret = -EINVAL; @@ -4414,23 +4416,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; + log->len_used = 0; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) + log->kbuf = vmalloc(log->len_total); + if (!log->kbuf) goto err_unlock; } else { - log_level = 0; + log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4467,15 +4469,16 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= log_size); + if (log->level && bpf_verifier_log_full(log)) { + BUG_ON(log->len_used >= log->len_total); /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; /* fall through to return what was recorded */ } /* copy verifier log back to user space including trailing zero */ - if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && copy_to_user(log->ubuf, log->kbuf, + log->len_used + 1) != 0) { ret = -EFAULT; goto free_log_buf; } @@ -4502,8 +4505,8 @@ skip_full_check: } free_log_buf: - if (log_level) - vfree(log_buf); + if (log->level) + vfree(log->kbuf); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. 
@@ -4540,7 +4543,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - log_level = 0; + verifier_log.level = 0; env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) -- cgit v1.2.3 From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:11 -0700 Subject: bpf: move global verifier log into verifier environment The biggest piece of global state protected by the verifier lock is the verifier_log. Move that log to struct bpf_verifier_env. struct bpf_verifier_env has to be passed now to all invocations of verbose(). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 491 ++++++++++++++++++++++++++------------------------ 1 file changed, 259 insertions(+), 232 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e53458b02249..a352f93cd4b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,20 +153,16 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static struct bpf_verifer_log verifier_log; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) 
{ - struct bpf_verifer_log *log = &verifier_log; + struct bpf_verifer_log *log = &env->log; va_list args; if (!log->level || bpf_verifier_log_full(log)) @@ -214,7 +210,8 @@ static const char *func_id_name(int id) return "unknown"; } -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -225,21 +222,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); + verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(",r=%d", reg->range); + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -247,38 +244,38 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, + verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } - verbose("\n"); + verbose(env, "\n"); } static const char *const bpf_class_string[] = { @@ -333,15 +330,15 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(const struct bpf_verifier_env *env, +static void print_bpf_end_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { - verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } -static void print_bpf_insn(const struct bpf_verifier_env *env, +static void print_bpf_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -349,23 +346,23 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose("BUG_alu64_%02x\n", insn->code); + verbose(env, "BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose("(%02x) r%d = %s-r%d\n", + verbose(env, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) %sr%d %s %sr%d\n", + verbose(env, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose("(%02x) %sr%d %s %s%d\n", + verbose(env, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -374,46 +371,46 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose("BUG_%02x\n", insn->code); + verbose(env, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); + verbose(env, "BUG_st_%02x\n", insn->code); return; } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); + verbose(env, "BUG_ldx_%02x\n", insn->code); return; } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -428,36 +425,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (map_ptr && !env->allow_ptr_leaks) imm = 0; - verbose("(%02x) r%d = 0x%llx\n", insn->code, + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, insn->dst_reg, (unsigned long long)imm); } else { - verbose("BUG_ld_%02x\n", insn->code); + verbose(env, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", 
insn->code, + verbose(env, "(%02x) call %s#%d\n", insn->code, func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", + verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); + verbose(env, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); } } @@ -496,7 +494,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; @@ -534,10 +532,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -647,10 +646,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -665,10 +665,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -677,22 +678,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = 
PTR_TO_CTX; - mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -726,26 +728,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *regs = env->cur_state.regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -770,7 +772,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; @@ -783,7 +786,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -818,7 +821,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -828,12 +832,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, if (slot_type[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (slot_type[i] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + verbose(env, "corrupted spill memory\n"); return -EACCES; } } @@ -849,14 +853,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } else { for (i = 0; i < size; i++) { if (slot_type[i] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -868,7 +872,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = env->cur_state.regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -887,8 +891,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 
regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (verifier_log.level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -896,13 +900,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * will have a set floor within our range. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -911,13 +916,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -956,7 +962,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -979,13 +985,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. 
*/ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; @@ -1021,7 +1027,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -1039,7 +1045,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1064,7 +1071,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1072,7 +1080,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1087,7 +1096,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1108,7 +1117,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way. 
*/ - return check_pkt_ptr_alignment(reg, off, size, strict); + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1121,7 +1130,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1153,20 +1163,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } /* ctx accesses must be at a fixed offset, so that we can @@ -1176,7 +1186,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", + verbose(env, + "variable ctx access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } @@ -1188,9 +1199,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); + mark_reg_known_zero(env, state->regs, + value_regno); state->regs[value_regno].id = 0; state->regs[value_regno].off = 0; state->regs[value_regno].range = 0; @@ -1206,13 +1218,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, + size); return -EACCES; } @@ -1223,29 +1236,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); + verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - err = check_stack_write(state, off, size, value_regno); + err = check_stack_write(env, state, off, size, + value_regno); } else { - err = check_stack_read(state, off, size, value_regno); + err = check_stack_read(env, state, off, size, + value_regno); } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } @@ -1265,7 +1281,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1280,7 +1296,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } @@ -1321,7 +1337,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1332,13 +1348,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } off = regs[regno].off + 
regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1354,7 +1370,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1397,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; @@ -1405,7 +1422,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } @@ -1443,7 +1460,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1461,7 +1478,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1477,7 +1494,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1497,7 +1514,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1514,7 +1532,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -1528,7 +1546,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1539,12 +1557,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int 
check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1632,7 +1651,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1666,7 +1685,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(regs, i); + mark_reg_unknown(env, regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) @@ -1688,7 +1707,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } @@ -1696,13 +1716,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1716,7 +1737,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1749,14 +1770,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1764,14 +1785,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1782,12 +1804,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d 
of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -1846,39 +1868,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad sbounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad ubounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad ubounds\n"); return -EINVAL; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (!env->allow_ptr_leaks) - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -1943,7 +1968,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -1953,7 +1978,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, */ if (ptr_reg->type == PTR_TO_STACK) { if (!env->allow_ptr_leaks) - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -2008,13 +2033,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * ptr &= ~3 which would reduce min_value by 3.) */ if (!env->allow_ptr_leaks) - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2180,7 +2205,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. 
*/ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2208,7 +2233,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2236,7 +2261,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2268,12 +2293,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. */ if (!env->allow_ptr_leaks) { - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } else { /* scalar += pointer @@ -2325,13 +2350,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2349,14 +2374,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2367,7 +2392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2381,7 +2406,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2391,7 +2416,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2411,11 +2436,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, 
+ "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); @@ -2430,14 +2456,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2446,7 +2472,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2458,7 +2484,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } @@ -2467,7 +2493,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -2820,13 +2846,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -2836,13 +2862,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -2954,11 +2980,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_good_pkt_pointers(this_branch, ®s[insn->src_reg], PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (verifier_log.level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch); return 0; } @@ -2977,11 +3004,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -3039,14 +3066,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if 
(!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -3056,7 +3083,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[BPF_REG_6].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3069,7 +3097,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -3077,7 +3105,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); return 0; } @@ -3097,22 +3125,22 @@ static int check_return_code(struct bpf_verifier_env *env) reg = &env->cur_state.regs[BPF_REG_0]; if (reg->type != SCALAR_VALUE) { - verbose("At program exit the register R0 is not a known value (%s)\n", + verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); return -EINVAL; } if (!tnum_in(range, reg->var_off)) { - verbose("At program exit the register R0 "); + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("has value %s", tn_buf); + verbose(env, "has value %s", tn_buf); } else { - verbose("has unknown scalar value"); + verbose(env, "has unknown scalar value"); } - verbose(" should have been 0 or 1\n"); + verbose(env, " should have been 0 or 1\n"); return -EINVAL; } return 0; @@ -3178,7 +3206,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3195,13 +3223,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, "insn state internal bug\n"); return -EFAULT; } return 0; @@ -3295,7 +3323,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3304,7 +3332,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -3685,7 +3713,7 @@ 
static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + init_reg_state(env, regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3694,7 +3722,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } @@ -3703,7 +3731,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -3713,12 +3742,12 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (verifier_log.level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", + verbose(env, "\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else - verbose("%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", insn_idx); } goto process_bpf_exit; } @@ -3726,19 +3755,18 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (verifier_log.level > 1 || - (verifier_log.level && do_print_state)) { - if (verifier_log.level > 1) - verbose("%d:", insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", insn_idx); else - verbose("\nfrom %d to %d:", + verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + print_verifier_state(env, &env->cur_state); do_print_state = false; } - if (verifier_log.level) { - verbose("%d: ", insn_idx); + if (env->log.level) { + verbose(env, "%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -3795,7 +3823,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. 
*/ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -3835,14 +3863,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -3865,7 +3893,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -3878,7 +3906,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -3890,7 +3918,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -3905,7 +3933,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } @@ -3940,19 +3968,19 @@ process_bpf_exit: insn_idx++; } else { - verbose("invalid BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -3964,7 +3992,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -3975,12 +4004,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -4003,14 +4032,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM 
&& BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } @@ -4021,7 +4050,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -4030,19 +4059,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4164,7 +4194,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4212,7 +4242,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -4231,7 +4261,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4313,7 +4343,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4357,7 +4387,8 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -4391,8 +4422,8 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, @@ -4401,6 +4432,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -4419,7 +4451,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->level = attr->log_level; log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - log->len_used = 0; ret = -EINVAL; /* log attributes have to be sane */ @@ -4431,8 +4462,6 @@ int 
bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->kbuf = vmalloc(log->len_total); if (!log->kbuf) goto err_unlock; - } else { - log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4543,8 +4572,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - verifier_log.level = 0; - env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; -- cgit v1.2.3 From f4ac7e0b5cc8d16004ac57ff679266d573f30f77 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:12 -0700 Subject: bpf: move instruction printing into a separate file Separate the instruction printing into a standalone source file. This way sneaky code from tools/ can compile it in directly. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 1 + kernel/bpf/disasm.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/disasm.h | 32 ++++++++ kernel/bpf/verifier.c | 202 +---------------------------------------------- 4 files changed, 251 insertions(+), 198 deletions(-) create mode 100644 kernel/bpf/disasm.c create mode 100644 kernel/bpf/disasm.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 897daa005b23..53fb09f92e3f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,6 +2,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o ifeq ($(CONFIG_STREAM_PARSER),y) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c new file mode 100644 index 000000000000..e682850c9715 --- /dev/null +++ b/kernel/bpf/disasm.c @@ -0,0 +1,214 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +const char *const bpf_class_string[8] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_cb verbose, + struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose(env, "BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(verbose, env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose(env, "(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + } else { + verbose(env, "(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? 
"(u32) " : "", + insn->imm); + } + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose(env, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_st_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_ldx_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !allow_ptr_leaks) + imm = 0; + + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); + } else { + verbose(env, "BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose(env, "(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose(env, "(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); + } +} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h new file mode 100644 index 000000000000..8de977e420b6 --- /dev/null +++ b/kernel/bpf/disasm.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#ifndef __BPF_DISASM_H__ +#define __BPF_DISASM_H__ + +#include +#include +#include + +extern const char *const bpf_alu_string[16]; +extern const char *const bpf_class_string[8]; + +const char *func_id_name(int id); + +struct bpf_verifier_env; +typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, + const char *, ...); +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks); + +#endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a352f93cd4b2..274c6582ec39 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,8 @@ #include #include +#include "disasm.h" + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -194,22 +196,6 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { - __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *func_id_name(int id) -{ - BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - - if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) - return func_id_str[id]; - else - return "unknown"; -} - static void print_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *state) { @@ -278,187 +264,6 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static const char *const bpf_class_string[] = { - [BPF_LD] = "ld", - [BPF_LDX] = "ldx", - [BPF_ST] = "st", - [BPF_STX] = "stx", - [BPF_ALU] = "alu", - [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", - [BPF_ALU64] = "alu64", -}; - -static const char *const bpf_alu_string[16] = { - [BPF_ADD >> 4] = "+=", - [BPF_SUB >> 4] = "-=", - [BPF_MUL >> 4] = "*=", - [BPF_DIV >> 4] = "/=", - [BPF_OR >> 4] = "|=", - [BPF_AND >> 4] = "&=", - [BPF_LSH >> 4] = "<<=", - [BPF_RSH >> 4] = ">>=", - [BPF_NEG >> 4] = "neg", - [BPF_MOD >> 4] = "%=", - [BPF_XOR >> 4] = "^=", - [BPF_MOV >> 4] = "=", - [BPF_ARSH >> 4] = "s>>=", - [BPF_END >> 4] = "endian", -}; - -static const char *const bpf_ldst_string[] = { - [BPF_W >> 3] = "u32", - [BPF_H >> 3] = "u16", - [BPF_B >> 3] = "u8", - [BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { - [BPF_JA >> 4] = "jmp", - [BPF_JEQ >> 4] = "==", - [BPF_JGT >> 4] = ">", - [BPF_JLT >> 4] = "<", - [BPF_JGE >> 4] = ">=", - [BPF_JLE >> 4] = "<=", - [BPF_JSET >> 4] = "&", - [BPF_JNE >> 4] = "!=", - [BPF_JSGT >> 4] = "s>", - [BPF_JSLT >> 4] = "s<", - [BPF_JSGE >> 4] = "s>=", - [BPF_JSLE >> 4] = "s<=", - [BPF_CALL >> 4] = "call", - [BPF_EXIT >> 4] = "exit", -}; - -static void print_bpf_end_insn(struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, - BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", - insn->imm, insn->dst_reg); -} - -static void print_bpf_insn(struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_OP(insn->code) == BPF_END) { - if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); - else - print_bpf_end_insn(env, insn); - } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? 
"(u32) " : "", - insn->dst_reg); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->src_reg); - } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->imm); - } - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, insn->off, - insn->src_reg); - else - verbose(env, "BUG_%02x\n", insn->code); - } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); - return; - } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); - } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); - return; - } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", - insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->off); - } else if (class == BPF_LD) { - if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM && - BPF_SIZE(insn->code) == BPF_DW) { - /* At this point, we already made sure that the second - * part of the ldimm64 insn is accessible. 
- */ - u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; - - if (map_ptr && !env->allow_ptr_leaks) - imm = 0; - - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); - } else { - verbose(env, "BUG_ld_%02x\n", insn->code); - return; - } - } else if (class == BPF_JMP) { - u8 opcode = BPF_OP(insn->code); - - if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); - } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", - insn->code, insn->off); - } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->src_reg, insn->off); - } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); - } - } else { - verbose(env, "(%02x) %s\n", - insn->code, bpf_class_string[class]); - } -} - static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) { struct bpf_verifier_stack_elem *elem; @@ -3767,7 +3572,8 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { verbose(env, "%d: ", insn_idx); - print_bpf_insn(env, insn); + print_bpf_insn(verbose, env, insn, + env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); -- cgit v1.2.3 From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:15 -0700 Subject: bpf: write back the verifier log buffer as it gets filled Verifier log buffer can be quite large (up to 16MB currently). As Eric Dumazet points out if we allow multiple verification requests to proceed simultaneously, malicious user may use the verifier as a way of allocating large amounts of unswappable memory to OOM the host. Switch to a strategy of allocating a smaller buffer (1024B) and writing it out into the user buffer after every print. While at it remove the old BUG_ON(). This is in preparation of the global verifier lock removal. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 274c6582ec39..2cdbcc4f8f6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -165,15 +165,26 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, const char *fmt, ...) 
{ struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (!log->level || bpf_verifier_log_full(log)) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log->len_used += vscnprintf(log->kbuf + log->len_used, - log->len_total - log->len_used, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; } static bool type_is_pkt_pointer(enum bpf_reg_type type) @@ -4263,11 +4274,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || !log->level || !log->ubuf) goto err_unlock; - - ret = -ENOMEM; - log->kbuf = vmalloc(log->len_total); - if (!log->kbuf) - goto err_unlock; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4304,18 +4310,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log->level && bpf_verifier_log_full(log)) { - BUG_ON(log->len_used >= log->len_total); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log->level && copy_to_user(log->ubuf, log->kbuf, - log->len_used + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -4326,7 +4325,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4339,9 +4338,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log->level) - vfree(log->kbuf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. -- cgit v1.2.3 From d59158162e032917a428704160a2063a02405ec6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 10 Oct 2017 15:51:37 -0700 Subject: tracing: Add support for preempt and irq enable/disable events Preempt and irq trace events can be used for tracing the start and end of an atomic section which can be used by a trace viewer like systrace to graphically view the start and end of an atomic section and correlate them with latencies and scheduling issues. This also serves as a prelude to using synthetic events or probes to rewrite the preempt and irqsoff tracers, along with numerous benefits of using trace events features for these events. 
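For illustration, a minimal consumer sketch in C, not part of this patch: it attaches probes to the new irq_disable/irq_enable events from built-in code. The event names and the (ip, parent_ip) arguments follow from the trace_irq_{enable,disable}_rcuidle() calls in the diff below; the header path, the function names, and the latency-accounting idea are assumptions made only for this sketch.

#include <linux/init.h>
#include <trace/events/preemptirq.h>	/* assumed companion header */

static void probe_irq_disable(void *data, unsigned long ip,
			      unsigned long parent_ip)
{
	/* hypothetical: start an "irqs off" measurement for this CPU */
}

static void probe_irq_enable(void *data, unsigned long ip,
			     unsigned long parent_ip)
{
	/* hypothetical: end the measurement and account the latency */
}

static int __init preemptirq_latency_init(void)
{
	int ret;

	/* register_trace_<event>() is generated for each trace event */
	ret = register_trace_irq_disable(probe_irq_disable, NULL);
	if (ret)
		return ret;
	return register_trace_irq_enable(probe_irq_enable, NULL);
}
early_initcall(preemptirq_latency_init);

Probes attached this way fire on every irq disable/enable transition, so they must be cheap and nonblocking.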
Link: http://lkml.kernel.org/r/20171006005432.14244-3-joelaf@google.com Link: http://lkml.kernel.org/r/20171010225137.17370-1-joelaf@google.com Cc: Peter Zilstra Cc: kernel-team@android.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 11 +++++++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 35 ++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..b8395a020821 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER address on the current task structure into a stack of calls. +config PREEMPTIRQ_EVENTS + bool "Enable trace events for preempt and irq disable/enable" + select TRACE_IRQFLAGS + depends on DEBUG_PREEMPT || !PROVE_LOCKING + default n + help + Enable tracing of disable and enable events for preemption and irqs. + For tracing preempt disable/enable events, DEBUG_PREEMPT must be + enabled. For tracing irq disable/enable events, PROVE_LOCKING must + be disabled. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 90f2701d92a7..9f62eee61f14 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 0e3033c00474..03ecb4465ee4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,9 @@ #include "trace.h" +#define CREATE_TRACE_POINTS +#include + #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -777,26 +780,53 @@ static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } #endif #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + void trace_hardirqs_on(void) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_off(); } EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_on_caller(caller_addr); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_off_caller(caller_addr); } 
EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -818,14 +848,17 @@ inline void print_irqtrace_events(struct task_struct *curr) } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preempt_enable_rcuidle(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preempt_disable_rcuidle(a0, a1); tracer_preempt_off(a0, a1); } #endif -- cgit v1.2.3 From 8715b108cd75523c9b2e833cdcd7aeb363767f95 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 9 Oct 2017 12:29:31 -0700 Subject: ftrace: Clear hashes of stale ips of init memory Filters should be cleared of init functions during freeing of init memory when the ftrace dyn records are released. However in current code, the filters are left as is. This patch clears the hashes of the saved init functions when the init memory is freed. This fixes the following issue reproducible with the following sequence of commands for a test module: ================================================ void bar(void) { printk(KERN_INFO "bar!\n"); } void foo(void) { printk(KERN_INFO "foo!\n"); bar(); } static int __init hello_init(void) { printk(KERN_INFO "Hello world!\n"); foo(); return 0; } static void __exit hello_cleanup(void) { printk(KERN_INFO "Cleaning up module.\n"); } module_init(hello_init); module_exit(hello_cleanup); ================================================ Commands: echo '*:mod:test' > /d/tracing/set_ftrace_filter echo function > /d/tracing/current_tracer modprobe test rmmod test sleep 1 modprobe test cat /d/tracing/set_ftrace_filter Behavior without patch: Init function is still in the filter Expected behavior: Shouldn't have any of the filters set Link: http://lkml.kernel.org/r/20171009192931.56401-1-joelaf@google.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e99bd55732e..e0a98225666b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6067,6 +6067,63 @@ allocate_ftrace_mod_map(struct module *mod, } #endif /* CONFIG_MODULES */ +struct ftrace_init_func { + struct list_head list; + unsigned long ip; +}; + +/* Clear any init ips from hashes */ +static void +clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + + if (ftrace_hash_empty(hash)) + return; + + entry = __ftrace_lookup_ip(hash, func->ip); + + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. 
+ */ + if (entry) + entry->ip = 0; +} + +static void +clear_func_from_hashes(struct ftrace_init_func *func) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_func_from_hash(func, tr->ops->func_hash->filter_hash); + clear_func_from_hash(func, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + +static void add_to_clear_hash_list(struct list_head *clear_list, + struct dyn_ftrace *rec) +{ + struct ftrace_init_func *func; + + func = kmalloc(sizeof(*func), GFP_KERNEL); + if (!func) { + WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n"); + return; + } + + func->ip = rec->ip; + list_add(&func->list, clear_list); +} + void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) { unsigned long start = (unsigned long)(start_ptr); @@ -6076,8 +6133,12 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct dyn_ftrace *rec; struct dyn_ftrace key; struct ftrace_mod_map *mod_map = NULL; + struct ftrace_init_func *func, *func_next; + struct list_head clear_hash; int order; + INIT_LIST_HEAD(&clear_hash); + key.ip = start; key.flags = end; /* overload flags, as it is unsigned long */ @@ -6102,6 +6163,9 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) if (!rec) continue; + /* rec will be cleared from hashes after ftrace_lock unlock */ + add_to_clear_hash_list(&clear_hash, rec); + if (mod_map) save_ftrace_mod_rec(mod_map, rec); @@ -6123,6 +6187,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) goto again; } mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(func, func_next, &clear_hash, list) { + clear_func_from_hashes(func); + kfree(func); + } } void __init ftrace_free_init_mem(void) -- cgit v1.2.3 From 75cb070960ade40fba5de32138390f3c85c90941 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Oct 2017 19:12:32 -0700 Subject: Revert "net: defer call to cgroup_sk_alloc()" This reverts commit fbb1fb4ad415cb31ce944f65a5ca700aaf73a227. This was not the proper fix, lets cleanly revert it, so that following patch can be carried to stable versions. sock_cgroup_ptr() callers do not expect a NULL return value. Signed-off-by: Eric Dumazet Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: David S. Miller --- kernel/cgroup/cgroup.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3380a3e49af5..44857278eb8a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5709,6 +5709,17 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). 
+ */ + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { -- cgit v1.2.3 From ef8daf8eeb5b8ab6bc356656163d19f20fb827ed Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Mon, 2 Oct 2017 11:56:48 -0400 Subject: livepatch: unpatch all klp_objects if klp_module_coming fails When an incoming module is considered for livepatching by klp_module_coming(), it iterates over multiple patches and multiple kernel objects in this order: list_for_each_entry(patch, &klp_patches, list) { klp_for_each_object(patch, obj) { which means that if one of the kernel objects fails to patch, klp_module_coming()'s error path needs to unpatch and cleanup any kernel objects that were already patched by a previous patch. Reported-by: Miroslav Benes Suggested-by: Petr Mladek Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 60 ++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index b9628e43c78f..bf8c8fd72589 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); +/* + * Remove parts of patches that touch a given kernel module. The list of + * patches processed might be limited. When limit is NULL, all patches + * will be handled. + */ +static void klp_cleanup_module_patches_limited(struct module *mod, + struct klp_patch *limit) +{ + struct klp_patch *patch; + struct klp_object *obj; + + list_for_each_entry(patch, &klp_patches, list) { + if (patch == limit) + break; + + klp_for_each_object(patch, obj) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + /* + * Only unpatch the module if the patch is enabled or + * is in transition. + */ + if (patch->enabled || patch == klp_transition_patch) { + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_unpatch_object(obj); + } + + klp_free_object_loaded(obj); + break; + } + } +} + int klp_module_coming(struct module *mod) { int ret; @@ -894,7 +929,7 @@ err: pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; - klp_free_object_loaded(obj); + klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); return ret; @@ -902,9 +937,6 @@ err: void klp_module_going(struct module *mod) { - struct klp_patch *patch; - struct klp_object *obj; - if (WARN_ON(mod->state != MODULE_STATE_GOING && mod->state != MODULE_STATE_COMING)) return; @@ -917,25 +949,7 @@ void klp_module_going(struct module *mod) */ mod->klp_alive = false; - list_for_each_entry(patch, &klp_patches, list) { - klp_for_each_object(patch, obj) { - if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) - continue; - - /* - * Only unpatch the module if the patch is enabled or - * is in transition. 
- */ - if (patch->enabled || patch == klp_transition_patch) { - pr_notice("reverting patch '%s' on unloading module '%s'\n", - patch->mod->name, obj->mod->name); - klp_unpatch_object(obj); - } - - klp_free_object_loaded(obj); - break; - } - } + klp_cleanup_module_patches_limited(mod, NULL); mutex_unlock(&klp_mutex); } -- cgit v1.2.3 From 952925dec0f276b407b2abce4aee82cba7c700c3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 11:56:23 +0100 Subject: bpf: remove redundant variable old_flags Variable old_flags is being assigned but is never read; it is redundant and can be removed. Cleans up clang warning: Value stored to 'old_flags' is never read Signed-off-by: Colin Ian King Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e88abc0865d5..3db5a17fcfe8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -192,7 +192,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; - u32 old_flags; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -239,7 +238,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = prog; } - old_flags = cgrp->bpf.flags[type]; cgrp->bpf.flags[type] = flags; /* allocate and recompute effective prog arrays */ -- cgit v1.2.3 From c5c1ea75a352c3864c4891f36155287a2ed73928 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 13 Jun 2017 13:06:59 +0200 Subject: tracing: Kconfig text fixes for CONFIG_HWLAT_TRACER Trivial spelling fixes for Kconfig help text of config HWLAT_TRACER. Fixes: e7c15cd8a113 ("tracing: Added hardware latency tracer") Signed-off-by: Jesper Dangaard Brouer Acked-by: Steven Rostedt Signed-off-by: Jiri Kosina --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..f54b7b6b4a4b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -224,7 +224,7 @@ config HWLAT_TRACER select GENERIC_TRACER help This tracer, when enabled will create one or more kernel threads, - depening on what the cpumask file is set to, which each thread + depending on what the cpumask file is set to, which each thread spinning in a loop looking for interruptions caused by something other than the kernel. For example, if a System Management Interrupt (SMI) takes a noticeable amount of @@ -239,7 +239,7 @@ config HWLAT_TRACER iteration A kernel thread is created that will spin with interrupts disabled - for "width" microseconds in every "widow" cycle. It will not spin + for "width" microseconds in every "window" cycle. It will not spin for "window - width" microseconds, where the system can continue to operate. -- cgit v1.2.3 From af41acf8347dd6d11a2a29a11e2866ca4892d600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 11 Oct 2017 12:46:47 -0400 Subject: printk: Remove superfluous memory barriers from printk_safe The variable printk_safe_irq_ready is set and never cleared at system boot up, when there's only one CPU active. It is set before other CPUs come on line. Also, it is extremely unlikely that an NMI would trigger this early in boot up (which I wonder why we even have this variable at all). 
Also mark the printk_safe_irq_ready as read mostly, as it is set at system boot up, and never touched again. Link: http://lkml.kernel.org/r/20171011124647.7781f98f@gandalf.local.home Reviewed-by: Petr Mladek Signed-off-by: Steven Rostedt (VMware) --- kernel/printk/printk_safe.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 3cdaeaef9ce1..724d9292d4b9 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -39,7 +39,7 @@ * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. */ -static int printk_safe_irq_ready; +static int printk_safe_irq_ready __read_mostly; #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ sizeof(atomic_t) - \ @@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); /* Get flushed in a more safe context. */ static void queue_flush_work(struct printk_safe_seq_buf *s) { - if (printk_safe_irq_ready) { - /* Make sure that IRQ work is really initialized. */ - smp_rmb(); + if (printk_safe_irq_ready) irq_work_queue(&s->work); - } } /* @@ -398,8 +395,12 @@ void __init printk_safe_init(void) #endif } - /* Make sure that IRQ works are initialized before enabling. */ - smp_wmb(); + /* + * In the highly unlikely event that a NMI were to trigger at + * this moment. Make sure IRQ work is set up before this + * variable is set. + */ + barrier(); printk_safe_irq_ready = 1; /* Flush pending messages that did not have scheduled IRQ works. */ -- cgit v1.2.3 From c3b5b6ed1eb4f429addfd9e8e8eb39d1a38c85d0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 16:22:20 +0200 Subject: tracing: mark trace_test_buffer as __maybe_unused After trace_selftest_startup_sched_switch is removed, trace_test_buffer() is only used sometimes, leading to this warning: kernel/trace/trace_selftest.c:62:12: error: 'trace_test_buffer' defined but not used [-Werror=unused-function] There is no simple #ifdef condition that captures well whether the function is in fact used or not, so marking it as __maybe_unused is probably the best way to shut up the warning. The function will then be silently dropped when there is no user. Link: http://lkml.kernel.org/r/20171013142227.1273469-1-arnd@arndb.de Fixes: d8c4deee6dc6 ("tracing: Remove obsolete sched_switch tracer selftest") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 364f78abdf47..eb9ba5c1ba40 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -59,7 +59,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) +static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; -- cgit v1.2.3 From 20608924cc2e6bdeaf6f58ccbe9ddfe12dbfa082 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 4 Oct 2017 14:26:26 +0200 Subject: genirq: generic chip: Add irq_gc_mask_disable_and_ack_set() The irq_gc_mask_disable_reg_and_ack() function name implies that it provides the combined functions of irq_gc_mask_disable_reg() and irq_gc_ack(). 
However, the implementation does not actually do that since it writes the mask instead of the disable register. It also does not maintain the mask cache which makes it inappropriate to use with other masking functions. In addition, commit 659fb32d1b67 ("genirq: replace irq_gc_ack() with {set,clr}_bit variants (fwd)") effectively renamed irq_gc_ack() to irq_gc_ack_set_bit() so this function probably should have also been renamed at that time. The generic chip code currently provides three functions for use with the irq_mask member of the irq_chip structure and two functions for use with the irq_ack member of the irq_chip structure. These functions could be combined into six functions for use with the irq_mask_ack member of the irq_chip structure. However, since only one of the combinations is currently used, only the function irq_gc_mask_disable_and_ack_set() is added by this commit. The '_reg' and '_bit' portions of the base function name were left out of the new combined function name in an attempt to keep the function name length manageable with the 80 character source code line length while still allowing the distinct aspects of each combination to be captured by the name. If other combinations are desired in the future please add them to the irq generic chip library at that time. Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- kernel/irq/generic-chip.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5270a54b9fa4..ec5fe9a0cb05 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -150,6 +150,31 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) irq_gc_unlock(gc); } +/** + * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt + * @d: irq_data + * + * This generic implementation of the irq_mask_ack method is for chips + * with separate enable/disable registers instead of a single mask + * register and where a pending interrupt is acknowledged by setting a + * bit. + * + * Note: This is the only permutation currently used. Similar generic + * functions should be added here if other permutations are required. + */ +void irq_gc_mask_disable_and_ack_set(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; + + irq_gc_lock(gc); + irq_reg_writel(gc, mask, ct->regs.disable); + *ct->mask_cache &= ~mask; + irq_reg_writel(gc, mask, ct->regs.ack); + irq_gc_unlock(gc); +} + /** * irq_gc_eoi - EOI interrupt * @d: irq_data -- cgit v1.2.3 From 0d08af35f16a0cc418ad2afde3bc5f70ace82705 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 4 Oct 2017 14:28:17 +0200 Subject: genirq: generic chip: remove irq_gc_mask_disable_reg_and_ack() Any usage of the irq_gc_mask_disable_reg_and_ack() function has been replaced with the desired functionality. The incorrect and ambiguously named function is removed here to prevent accidental misuse. 
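To make the intended replacement concrete, a minimal sketch in C of how an irqchip driver with separate enable/disable registers and a write-one-to-ack status register would wire up the combined callback; the EXAMPLE_* register offsets and the setup function are hypothetical, while the irq_gc_* helpers are the existing generic-chip ones plus the function added above.

#include <linux/irq.h>

#define EXAMPLE_IRQ_ENABLE	0x00	/* assumed register layout */
#define EXAMPLE_IRQ_DISABLE	0x04
#define EXAMPLE_IRQ_STATUS	0x08	/* write 1 to ack */

static void example_gc_setup(struct irq_chip_generic *gc)
{
	struct irq_chip_type *ct = gc->chip_types;

	ct->regs.enable  = EXAMPLE_IRQ_ENABLE;
	ct->regs.disable = EXAMPLE_IRQ_DISABLE;
	ct->regs.ack     = EXAMPLE_IRQ_STATUS;

	ct->chip.irq_unmask   = irq_gc_unmask_enable_reg;
	ct->chip.irq_mask     = irq_gc_mask_disable_reg;
	ct->chip.irq_ack      = irq_gc_ack_set_bit;
	/* Combined mask+ack: writes the disable register and keeps
	 * *ct->mask_cache coherent, unlike the removed helper. */
	ct->chip.irq_mask_ack = irq_gc_mask_disable_and_ack_set;
}

A driver that previously pointed irq_mask_ack at irq_gc_mask_disable_reg_and_ack() is expected to switch to the combined helper above before the old one is deleted.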
Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- kernel/irq/generic-chip.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ec5fe9a0cb05..c26c5bb6b491 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -134,22 +134,6 @@ void irq_gc_ack_clr_bit(struct irq_data *d) irq_gc_unlock(gc); } -/** - * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt - * @d: irq_data - */ -void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = d->mask; - - irq_gc_lock(gc); - irq_reg_writel(gc, mask, ct->regs.mask); - irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); -} - /** * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt * @d: irq_data -- cgit v1.2.3 From ca182551857cc2c1e6a2b7f1e72090a137a15008 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 13 Oct 2017 15:58:22 -0700 Subject: kmemleak: clear stale pointers from task stacks Kmemleak considers any pointers on task stacks as references. This patch clears newly allocated and reused vmap stacks. Link: http://lkml.kernel.org/r/150728990124.744199.8403409836394318684.stgit@buzz Signed-off-by: Konstantin Khlebnikov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e702cb9ffbd8..07cc743698d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; +#ifdef CONFIG_DEBUG_KMEMLEAK + /* Clear stale pointers from reused stack. */ + memset(s->addr, 0, THREAD_SIZE); +#endif tsk->stack_vm_area = s; return s->addr; } -- cgit v1.2.3 From 1bdec44955edc22fb840f5965987d2972307dcc9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 12 Oct 2017 10:34:07 -0700 Subject: bpf: verifier: set reg_type on context accesses in second pass Use a simplified is_valid_access() callback when verifier is used for program analysis by non-host JITs. This allows us to teach the verifier about packet start and packet end offsets for direct packet access. We can extend the callback as needed but for most packet processing needs there isn't much more the offloads may require. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cdbcc4f8f6b..9755279d94cb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -813,6 +813,36 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } +static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, + struct bpf_insn_access_aux *info) +{ + switch (env->prog->type) { + case BPF_PROG_TYPE_XDP: + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + case BPF_PROG_TYPE_SCHED_CLS: + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + default: + return false; + } +} + /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -821,12 +851,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - /* for analyzer ctx accesses are already validated and converted */ - if (env->analyzer_ops) - return 0; - - if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + if (env->analyzer_ops) { + if (analyzer_is_valid_access(env, off, &info)) { + *reg_type = info.reg_type; + return 0; + } + } else if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower -- cgit v1.2.3 From 9185a610f8f7f1b4e4d28c9de27d1969cf58e0f1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:40:02 -0400 Subject: tracing: bpf: Hide bpf trace events when they are not used All the trace events defined in include/trace/events/bpf.h are only used when CONFIG_BPF_SYSCALL is defined. But this file gets included by include/linux/bpf_trace.h which is included by the networking code with CREATE_TRACE_POINTS defined. If a trace event is created but not used it still has data structures and functions created for its use, even though nothing is using them. To not waste space, do not define the BPF trace events in bpf.h unless CONFIG_BPF_SYSCALL is defined. Signed-off-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 248961af2421..8e7c8bf2b687 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1580,5 +1580,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); +#endif -- cgit v1.2.3 From 8fd0fbbe8888f295eb34172a7e47bf7d3a0a4687 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:29 +0200 Subject: perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Revert commit: 75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function") The reason I instantly stumbled on that patch is that it only addresses the ftrace situation and doesn't mention the other _5_ places that use this interface. It doesn't explain why those don't have the problem and if not, why their solution doesn't work for ftrace. It doesn't, but this is just putting more duct tape on. Link: http://lkml.kernel.org/r/20171011080224.200565770@infradead.org Cc: Zhou Chengming Cc: Jiri Olsa Cc: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/events/core.c | 13 ++++--------- kernel/trace/trace_event_perf.c | 4 +--- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 5 files changed, 10 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..b8db80c5513b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,15 +7954,16 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task, NULL); + rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task, struct perf_event *event) + struct task_struct *task) { struct perf_sample_data data; + struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7976,15 +7977,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - /* Use the given event instead of the hlist */ - if (event) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); - } else { - hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } } /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 13ba2d3f6a91..562fa69df5d3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,7 +306,6 @@ static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { - struct perf_event *event; struct ftrace_entry *entry; struct hlist_head *head; struct pt_regs regs; @@ -330,9 +329,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - event = container_of(ops, struct perf_event, ftrace_ops); 
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, ®s, head, NULL, event); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index af6134f2e597..996902a526d4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..934b0da72679 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -622,7 +622,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -716,7 +716,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) } perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, - 1, regs, head, NULL, NULL); + 1, regs, head, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b34965e62fb9..402120ba4594 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, } perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); out: preempt_enable(); } -- cgit v1.2.3 From 466c81c45b650deca213fda3d0ec4761667379a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Oct 2017 17:15:47 +0200 Subject: perf/ftrace: Fix function trace events The function-trace <-> perf interface is a tad messed up. Where all the other trace <-> perf interfaces use a single trace hook registration and use per-cpu RCU based hlist to iterate the events, function-trace actually needs multiple hook registrations in order to minimize function entry patching when filters are present. The end result is that we iterate events both on the trace hook and on the hlist, which results in reporting events multiple times. Since function-trace cannot use the regular scheme, fix it the other way around, use singleton hlists. 
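For illustration only (not part of the patch): the singleton-hlist trick amounts to pointing a stack-allocated hlist_head at a single node so the existing RCU list walk visits exactly one entry. A minimal standalone sketch with placeholder names, assuming the node is in its INIT_HLIST_NODE state (next == NULL) and the caller is inside an RCU read-side section, as the perf callers are:

#include <linux/rculist.h>

struct foo_event {
	struct hlist_node hlist_entry;	/* INIT_HLIST_NODE'd, not on any list */
	int id;
};

static void visit_one(struct foo_event *ev)
{
	struct hlist_head head;
	struct foo_event *pos;

	/* Fake a one-entry list: the walk stops when it hits next == NULL. */
	head.first = &ev->hlist_entry;

	hlist_for_each_entry_rcu(pos, &head, hlist_entry)
		pr_info("event %d\n", pos->id);	/* runs exactly once, for ev */
}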
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 80 +++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..e73f9ab15939 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - struct hlist_head __percpu *pcpu_list; - struct hlist_head *list; - - pcpu_list = tp_event->perf_events; - if (WARN_ON_ONCE(!pcpu_list)) - return -EINVAL; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; - list = this_cpu_ptr(pcpu_list); - hlist_add_head_rcu(&p_event->hlist_entry, list); + /* + * If TRACE_REG_PERF_ADD returns false; no custom action was performed + * and we need to take the default action of enqueueing our event on + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + } - return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); + return 0; } void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - hlist_del_rcu(&p_event->hlist_entry); - tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); + + /* + * If TRACE_REG_PERF_DEL returns false; no custom action was performed + * and we need to take the default action of dequeueing our event from + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) + hlist_del_rcu(&p_event->hlist_entry); } void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) @@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; - struct hlist_head *head; + struct perf_event *event; + struct hlist_head head; struct pt_regs regs; int rctx; - head = this_cpu_ptr(event_function.perf_events); - if (hlist_empty(head)) + if ((unsigned long)ops->private != smp_processor_id()) return; + event = container_of(ops, struct perf_event, ftrace_ops); + + /* + * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all + * the perf code does is hlist_for_each_entry_rcu(), so we can + * get away with simply setting the @head.first pointer in order + * to create a singular list. 
+ */ + head.first = &event->hlist_entry; + #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) @@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, ®s, head, NULL); + 1, ®s, &head, NULL); #undef ENTRY_SIZE } @@ -339,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; - ops->func = perf_ftrace_function_call; + ops->flags |= FTRACE_OPS_FL_RCU; + ops->func = perf_ftrace_function_call; + ops->private = (void *)(unsigned long)nr_cpu_ids; + return register_ftrace_function(ops); } @@ -352,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event) return ret; } -static void perf_ftrace_function_enable(struct perf_event *event) -{ - ftrace_function_local_enable(&event->ftrace_ops); -} - -static void perf_ftrace_function_disable(struct perf_event *event) -{ - ftrace_function_local_disable(&event->ftrace_ops); -} - int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { + struct perf_event *event = data; + switch (type) { case TRACE_REG_REGISTER: case TRACE_REG_UNREGISTER: @@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call, case TRACE_REG_PERF_CLOSE: return perf_ftrace_function_unregister(data); case TRACE_REG_PERF_ADD: - perf_ftrace_function_enable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); + return 1; case TRACE_REG_PERF_DEL: - perf_ftrace_function_disable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; + return 1; } return -EINVAL; -- cgit v1.2.3 From 1dd311e6dcda4020c603bcf9f390a577d439d509 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:31 +0200 Subject: perf/ftrace: Small cleanup ops->flags _should_ be 0 at this point, so setting the flag using bitwise or is a bit daft. Link: http://lkml.kernel.org/r/20171011080224.315585202@infradead.org Requested-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e73f9ab15939..55d6dff37daf 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -363,7 +363,7 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_RCU; + ops->flags = FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; ops->private = (void *)(unsigned long)nr_cpu_ids; -- cgit v1.2.3 From b3a88803ac5b4bda26017b485c8722a8487fefb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:32 +0200 Subject: ftrace: Kill FTRACE_OPS_FL_PER_CPU The one and only user of FTRACE_OPS_FL_PER_CPU is gone, remove the lot. 
Link: http://lkml.kernel.org/r/20171011080224.372422809@infradead.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 55 ++++++--------------------------------------------- 1 file changed, 6 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e0a98225666b..2fd3edaec6de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -203,30 +203,6 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void per_cpu_ops_disable_all(struct ftrace_ops *ops) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(ops->disabled, cpu) = 1; -} - -static int per_cpu_ops_alloc(struct ftrace_ops *ops) -{ - int __percpu *disabled; - - if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) - return -EINVAL; - - disabled = alloc_percpu(int); - if (!disabled) - return -ENOMEM; - - ops->disabled = disabled; - per_cpu_ops_disable_all(ops); - return 0; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | - FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || + FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_PER_CPU) { - if (per_cpu_ops_alloc(ops)) - return -ENOMEM; - } - add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ @@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void per_cpu_ops_free(struct ftrace_ops *ops) -{ - free_percpu(ops->disabled); -} - static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) goto free_ops; return 0; @@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * The same goes for freeing the per_cpu data of the per_cpu * ops. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { /* * We need to do a hard force of sched synchronization. * This is because we use preempt_disable() to do RCU, but @@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) free_ops: arch_ftrace_trampoline_free(ops); - - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); } return 0; @@ -6355,10 +6318,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * If any of the above fails then the op->func() is not executed. 
*/ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && - (!(op->flags & FTRACE_OPS_FL_PER_CPU) || - !ftrace_function_local_disabled(op)) && ftrace_ops_test(op, ip, regs)) { - if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -6416,10 +6376,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); - if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || - !ftrace_function_local_disabled(op)) { - op->func(ip, parent_ip, op, regs); - } + op->func(ip, parent_ip, op, regs); preempt_enable_notrace(); trace_clear_recursion(bit); @@ -6443,7 +6400,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) * or does per cpu logic, then we need to call the assist handler. */ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + ops->flags & FTRACE_OPS_FL_RCU) return ftrace_ops_assist_func; return ops->func; -- cgit v1.2.3 From fe460423438b62eb7440d994ab19a9f444e6280d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 20:29:38 +0200 Subject: posix-stubs: Use get_timespec64() and put_timespec64() This is a follow-up to commit 5c4994102fb5 ("posix-timers: Use get_timespec64() and put_timespec64()"), which left two system call using copy_from_user()/copy_to_user(). Change them as well for consistency. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Acked-by: Nicolas Pitre Cc: y2038@lists.linaro.org Cc: John Stultz Cc: Al Viro Cc: Deepa Dinamani Link: https://lkml.kernel.org/r/20171013183009.3442318-1-arnd@arndb.de --- kernel/time/posix-stubs.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 06f34feb635e..b258bee13b02 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -117,8 +117,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct timespec64 t64; - struct timespec t; + struct timespec64 t; switch (which_clock) { case CLOCK_REALTIME: @@ -129,16 +128,15 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; } - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -203,8 +201,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { - struct timespec64 t64; - struct timespec t; + struct timespec64 t; switch (which_clock) { case CLOCK_REALTIME: @@ -215,16 +212,15 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec(&t, rqtp)) + if (compat_get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? 
TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } -- cgit v1.2.3 From 4eb1bca1793385b8caff4b2e1f19b31a013dd1ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 20:34:35 +0200 Subject: time: Use do_settimeofday64() internally do_settimeofday() is a wrapper around do_settimeofday64(), so that function can be called directly. The wrapper can be removed once the last user is gone. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: y2038@lists.linaro.org Cc: Frederic Weisbecker Cc: Stephen Boyd Cc: John Stultz Cc: Al Viro Cc: Deepa Dinamani Link: https://lkml.kernel.org/r/20171013183452.3635956-1-arnd@arndb.de --- kernel/time/time.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 44a8c1402133..cfe3d3e4679f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -82,7 +82,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) SYSCALL_DEFINE1(stime, time_t __user *, tptr) { - struct timespec tv; + struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) @@ -90,11 +90,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) tv.tv_nsec = 0; - err = security_settime(&tv, NULL); + err = security_settime64(&tv, NULL); if (err) return err; - do_settimeofday(&tv); + do_settimeofday64(&tv); return 0; } @@ -122,7 +122,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) { - struct timespec tv; + struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) @@ -130,11 +130,11 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) tv.tv_nsec = 0; - err = security_settime(&tv, NULL); + err = security_settime64(&tv, NULL); if (err) return err; - do_settimeofday(&tv); + do_settimeofday64(&tv); return 0; } -- cgit v1.2.3 From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:28 +0200 Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP The 'cpumap' is primarily used as a backend map for XDP BPF helper call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. This patch implement the main part of the map. It is not connected to the XDP redirect system yet, and no SKB allocation are done yet. The main concern in this patch is to ensure the datapath can run without any locking. This adds complexity to the setup and tear-down procedure, which assumptions are extra carefully documented in the code comments. 
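For illustration only (not part of the patch): from user space the map is keyed by destination CPU and the 4-byte value is that CPU's queue size, so setting up an entry looks roughly like the sketch below. The tools/lib/bpf helper names and header path are assumed here; per the update path above, the queue size must stay at or below 16384, the key must be a possible CPU, and writing a size of 0 tears the entry down again.

#include <bpf/bpf.h>	/* tools/lib/bpf wrappers; names/path assumed */

static int setup_cpumap(int nr_cpus)
{
	__u32 key = 2;		/* destination CPU; must be cpu_possible() */
	__u32 qsize = 128;	/* ptr_ring slots for that CPU, <= 16384 */
	int fd;

	fd = bpf_create_map(BPF_MAP_TYPE_CPUMAP, sizeof(__u32),
			    sizeof(__u32), nr_cpus, 0);
	if (fd < 0)
		return fd;

	/* BPF_NOEXIST is rejected by cpu_map_update_elem(); use BPF_ANY. */
	if (bpf_map_update_elem(fd, &key, &qsize, BPF_ANY) < 0)
		return -1;

	/* An XDP program can then bpf_redirect_map() into this fd's map. */
	return fd;
}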
V2: - make sure array isn't larger than NR_CPUS - make sure CPUs added is a valid possible CPU V3: fix nitpicks from Jakub Kicinski V5: - Restrict map allocation to root / CAP_SYS_ADMIN - WARN_ON_ONCE if queue is not empty on tear-down - Return -EPERM on memlock limit instead of -ENOMEM - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup() - Moved cpu_map_enqueue() to next patch V6: all notice by Daniel Borkmann - Fix err return code in cpu_map_alloc() introduced in V5 - Move cpu_possible() check after max_entries boundary check - Forbid usage initially in check_map_func_compatibility() V7: - Fix alloc error path spotted by Daniel Borkmann - Did stress test adding+removing CPUs from the map concurrently - Fixed refcnt issue on cpu_map_entry, kthread started too soon - Make sure packets are flushed during tear-down, involved use of rcu_barrier() and kthread_run only exit after queue is empty - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring V8: - Nitpicking comments and gramma by Edward Cree - Fix missing semi-colon introduced in V7 due to rebasing - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 1 + kernel/bpf/cpumap.c | 560 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 8 +- kernel/bpf/verifier.c | 5 + 4 files changed, 573 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/cpumap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 53fb09f92e3f..e597daae6120 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += cpumap.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c new file mode 100644 index 000000000000..e1e25ddba038 --- /dev/null +++ b/kernel/bpf/cpumap.c @@ -0,0 +1,560 @@ +/* bpf/cpumap.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ + +/* The 'cpumap' is primarily used as a backend map for XDP BPF helper + * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. + * + * Unlike devmap which redirects XDP frames out another NIC device, + * this map type redirects raw XDP frames to another CPU. The remote + * CPU will do SKB-allocation and call the normal network stack. + * + * This is a scalability and isolation mechanism, that allow + * separating the early driver network XDP layer, from the rest of the + * netstack, and assigning dedicated CPUs for this stage. This + * basically allows for 10G wirespeed pre-filtering via bpf. + */ +#include +#include +#include + +#include +#include +#include +#include + +/* General idea: XDP packets getting XDP redirected to another CPU, + * will maximum be stored/queued for one driver ->poll() call. It is + * guaranteed that setting flush bit and flush operation happen on + * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() + * which queue in bpf_cpu_map_entry contains packets. 
+ */ + +#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct xdp_bulk_queue { + void *q[CPU_MAP_BULK_SIZE]; + unsigned int count; +}; + +/* Struct for every remote "destination" CPU in map */ +struct bpf_cpu_map_entry { + u32 qsize; /* Queue size placeholder for map lookup */ + + /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ + struct xdp_bulk_queue __percpu *bulkq; + + /* Queue with potential multi-producers, and single-consumer kthread */ + struct ptr_ring *queue; + struct task_struct *kthread; + struct work_struct kthread_stop_wq; + + atomic_t refcnt; /* Control when this struct can be free'ed */ + struct rcu_head rcu; +}; + +struct bpf_cpu_map { + struct bpf_map map; + /* Below members specific for map type */ + struct bpf_cpu_map_entry **cpu_map; + unsigned long __percpu *flush_needed; +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq); + +static u64 cpu_map_bitmap_size(const union bpf_attr *attr) +{ + return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) +{ + struct bpf_cpu_map *cmap; + int err = -ENOMEM; + u64 cost; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + return ERR_PTR(-EINVAL); + + cmap = kzalloc(sizeof(*cmap), GFP_USER); + if (!cmap) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + cmap->map.map_type = attr->map_type; + cmap->map.key_size = attr->key_size; + cmap->map.value_size = attr->value_size; + cmap->map.max_entries = attr->max_entries; + cmap->map.map_flags = attr->map_flags; + cmap->map.numa_node = bpf_map_attr_numa_node(attr); + + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + if (cmap->map.max_entries > NR_CPUS) { + err = -E2BIG; + goto free_cmap; + } + + /* make sure page count doesn't overflow */ + cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); + cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_cmap; + cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + ret = bpf_map_precharge_memlock(cmap->map.pages); + if (ret) { + err = ret; + goto free_cmap; + } + + /* A per cpu bitfield with a bit per possible CPU in map */ + cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), + __alignof__(unsigned long)); + if (!cmap->flush_needed) + goto free_cmap; + + /* Alloc array for possible remote "destination" CPUs */ + cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * + sizeof(struct bpf_cpu_map_entry *), + cmap->map.numa_node); + if (!cmap->cpu_map) + goto free_percpu; + + return &cmap->map; +free_percpu: + free_percpu(cmap->flush_needed); +free_cmap: + kfree(cmap); + return ERR_PTR(err); +} + +void __cpu_map_queue_destructor(void *ptr) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. 
+ */ + if (WARN_ON_ONCE(ptr)) + page_frag_free(ptr); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); + kfree(rcpu->queue); + kfree(rcpu); + } +} + +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + atomic_inc(&rcpu->refcnt); +} + +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ + struct bpf_cpu_map_entry *rcpu; + + rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + + /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, + * as it waits until all in-flight call_rcu() callbacks complete. + */ + rcu_barrier(); + + /* kthread_stop will wake_up_process and wait for it to complete */ + kthread_stop(rcpu->kthread); +} + +static int cpu_map_kthread_run(void *data) +{ + struct bpf_cpu_map_entry *rcpu = data; + + set_current_state(TASK_INTERRUPTIBLE); + + /* When kthread gives stop order, then rcpu have been disconnected + * from map, thus no new packets can enter. Remaining in-flight + * per CPU stored packets are flushed to this queue. Wait honoring + * kthread_stop signal until queue is empty. + */ + while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_pkt *xdp_pkt; + + schedule(); + /* Do work */ + while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { + /* For now just "refcnt-free" */ + page_frag_free(xdp_pkt); + } + __set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + put_cpu_map_entry(rcpu); + return 0; +} + +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +{ + gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + struct bpf_cpu_map_entry *rcpu; + int numa, err; + + /* Have map->numa_node, but choose node of redirect target CPU */ + numa = cpu_to_node(cpu); + + rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + if (!rcpu) + return NULL; + + /* Alloc percpu bulkq */ + rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), + sizeof(void *), gfp); + if (!rcpu->bulkq) + goto free_rcu; + + /* Alloc queue */ + rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + if (!rcpu->queue) + goto free_bulkq; + + err = ptr_ring_init(rcpu->queue, qsize, gfp); + if (err) + goto free_queue; + + rcpu->qsize = qsize; + + /* Setup kthread */ + rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, + "cpumap/%d/map:%d", cpu, map_id); + if (IS_ERR(rcpu->kthread)) + goto free_ptr_ring; + + get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ + get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + + /* Make sure kthread runs on a single CPU */ + kthread_bind(rcpu->kthread, cpu); + wake_up_process(rcpu->kthread); + + return rcpu; + +free_ptr_ring: + ptr_ring_cleanup(rcpu->queue, NULL); +free_queue: + kfree(rcpu->queue); +free_bulkq: + free_percpu(rcpu->bulkq); +free_rcu: + kfree(rcpu); + return NULL; +} + +void __cpu_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_cpu_map_entry *rcpu; + int cpu; + + /* This cpu_map_entry have been disconnected from map and one + * RCU graze-period have elapsed. Thus, XDP cannot queue any + * new packets and cannot change/set flush_needed that can + * find this entry. 
+ */ + rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + + /* Flush remaining packets in percpu bulkq */ + for_each_online_cpu(cpu) { + struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); + + /* No concurrent bq_enqueue can run at this point */ + bq_flush_to_queue(rcpu, bq); + } + free_percpu(rcpu->bulkq); + /* Cannot kthread_stop() here, last put free rcpu resources */ + put_cpu_map_entry(rcpu); +} + +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to + * ensure any driver rcu critical sections have completed, but this + * does not guarantee a flush has happened yet. Because driver side + * rcu_read_lock/unlock only protects the running XDP program. The + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a + * pending flush op doesn't fail. + * + * The bpf_cpu_map_entry is still used by the kthread, and there can + * still be pending packets (in queue and percpu bulkq). A refcnt + * makes sure to last user (kthread_stop vs. call_rcu) free memory + * resources. + * + * The rcu callback __cpu_map_entry_free flush remaining packets in + * percpu bulkq to queue. Due to caller map_delete_elem() disable + * preemption, cannot call kthread_stop() to make sure queue is empty. + * Instead a work_queue is started for stopping kthread, + * cpu_map_kthread_stop, which waits for an RCU graze period before + * stopping kthread, emptying the queue. + */ +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +{ + struct bpf_cpu_map_entry *old_rcpu; + + old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + if (old_rcpu) { + call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); + INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); + schedule_work(&old_rcpu->kthread_stop_wq); + } +} + +int cpu_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 key_cpu = *(u32 *)key; + + if (key_cpu >= map->max_entries) + return -EINVAL; + + /* notice caller map_delete_elem() use preempt_disable() */ + __cpu_map_entry_replace(cmap, key_cpu, NULL); + return 0; +} + +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + /* Array index key correspond to CPU number */ + u32 key_cpu = *(u32 *)key; + /* Value is the queue size */ + u32 qsize = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(key_cpu >= cmap->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + return -EOVERFLOW; + + /* Make sure CPU is a valid possible cpu */ + if (!cpu_possible(key_cpu)) + return -ENODEV; + + if (qsize == 0) { + rcpu = NULL; /* Same as deleting */ + } else { + /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ + rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + if (!rcpu) + return -ENOMEM; + } + rcu_read_lock(); + __cpu_map_entry_replace(cmap, key_cpu, rcpu); + rcu_read_unlock(); + return 0; +} + +void cpu_map_free(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + int cpu; + u32 i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the bpf programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. 
The rcu critical section only guarantees + * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. + * It does __not__ ensure pending flush operations (if any) are + * complete. + */ + synchronize_rcu(); + + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. + */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + + while (!bitmap_empty(bitmap, cmap->map.max_entries)) + cond_resched(); + } + + /* For cpu_map the remote CPUs can still be using the entries + * (struct bpf_cpu_map_entry). + */ + for (i = 0; i < cmap->map.max_entries; i++) { + struct bpf_cpu_map_entry *rcpu; + + rcpu = READ_ONCE(cmap->cpu_map[i]); + if (!rcpu) + continue; + + /* bq flush and cleanup happens after RCU graze-period */ + __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + } + free_percpu(cmap->flush_needed); + bpf_map_area_free(cmap->cpu_map); + kfree(cmap); +} + +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + if (key >= map->max_entries) + return NULL; + + rcpu = READ_ONCE(cmap->cpu_map[key]); + return rcpu; +} + +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map_entry *rcpu = + __cpu_map_lookup_elem(map, *(u32 *)key); + + return rcpu ? &rcpu->qsize : NULL; +} + +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= cmap->map.max_entries) { + *next = 0; + return 0; + } + + if (index == cmap->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +const struct bpf_map_ops cpu_map_ops = { + .map_alloc = cpu_map_alloc, + .map_free = cpu_map_free, + .map_delete_elem = cpu_map_delete_elem, + .map_update_elem = cpu_map_update_elem, + .map_lookup_elem = cpu_map_lookup_elem, + .map_get_next_key = cpu_map_get_next_key, +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq) +{ + struct ptr_ring *q; + int i; + + if (unlikely(!bq->count)) + return 0; + + q = rcpu->queue; + spin_lock(&q->producer_lock); + + for (i = 0; i < bq->count; i++) { + void *xdp_pkt = bq->q[i]; + int err; + + err = __ptr_ring_produce(q, xdp_pkt); + if (err) { + /* Free xdp_pkt */ + page_frag_free(xdp_pkt); + } + } + bq->count = 0; + spin_unlock(&q->producer_lock); + + return 0; +} + +/* Notice: Will change in later patch */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; +}; + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + + if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) + bq_flush_to_queue(rcpu, bq); + + /* Notice, xdp_buff/page MUST be queued here, long enough for + * driver to code invoking us to finished, due to driver + * (e.g. ixgbe) recycle tricks based on page-refcnt. + * + * Thus, incoming xdp_pkt is always queued here (else we race + * with another CPU on page-refcnt and remaining driver code). 
+ * Queue time is very short, as driver will invoke flush + * operation, when completing napi->poll call. + */ + bq->q[bq->count++] = xdp_pkt; + return 0; +} + +void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + + __set_bit(bit, bitmap); +} + +void __cpu_map_flush(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + u32 bit; + + /* The napi->poll softirq makes sure __cpu_map_insert_ctx() + * and __cpu_map_flush() happen on same CPU. Thus, the percpu + * bitmap indicate which percpu bulkq have packets. + */ + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); + struct xdp_bulk_queue *bq; + + /* This is possible if entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!rcpu)) + continue; + + __clear_bit(bit, bitmap); + + /* Flush all frames in bulkq to real queue */ + bq = this_cpu_ptr(rcpu->bulkq); + bq_flush_to_queue(rcpu, bq); + + /* If already running, costs spin_lock_irqsave + smb_mb */ + wake_up_process(rcpu->kthread); + } +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d124e702e040..54fba06942f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* Need to create a kthread, thus must support schedule */ + if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + err = map->ops->map_update_elem(map, key, value, attr->flags); + goto out; + } + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from * inside bpf map update or delete otherwise deadlocks are possible */ @@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_update_elem(map, ufd, key, value); free_value: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9755279d94cb..cefa64be9a2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; + /* Restrict bpf side of cpumap, open when use-cases appear */ + case BPF_MAP_TYPE_CPUMAP: + if (func_id != BPF_FUNC_redirect_map) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) -- cgit v1.2.3 From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:34 +0200 Subject: bpf: XDP_REDIRECT enable use of cpumap This patch connects cpumap to the xdp_do_redirect_map infrastructure. Still no SKB allocation are done yet. The XDP frames are transferred to the other CPU, but they are simply refcnt decremented on the remote CPU. This served as a good benchmark for measuring the overhead of remote refcnt decrement. If driver page recycle cache is not efficient then this, exposes a bottleneck in the page allocator. A shout-out to MST's ptr_ring, which is the secret behind is being so efficient to transfer memory pointers between CPUs, without constantly bouncing cache-lines between CPUs. V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot. 
V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet, as implementation require a separate upstream discussion. V5: - Fix a maybe-uninitialized pointed out by kbuild test robot. - Restrict bpf-prog side access to cpumap, open when use-cases appear - Implement cpu_map_enqueue() as a more simple void pointer enqueue V6: - Allow cpumap type for usage in helper bpf_redirect_map, general bpf-prog side restriction moved to earlier patch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 22 +++++++++++++++++++++- kernel/bpf/verifier.c | 3 ++- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e1e25ddba038..768da6a2c265 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -500,7 +500,7 @@ struct xdp_pkt { /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -520,6 +520,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) return 0; } +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct xdp_pkt *xdp_pkt; + int headroom; + + /* For now this is just used as a void pointer to data_hard_start. + * Followup patch will generalize this. + */ + xdp_pkt = xdp->data_hard_start; + + /* Fake writing into xdp_pkt->data to measure overhead */ + headroom = xdp->data - xdp->data_hard_start; + if (headroom < sizeof(*xdp_pkt)) + xdp_pkt->data = xdp->data; + + bq_enqueue(rcpu, xdp_pkt); + return 0; +} + void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cefa64be9a2f..e4d5136725a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1486,7 +1486,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP) + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_CPUMAP) goto error; break; case BPF_FUNC_sk_redirect_map: -- cgit v1.2.3 From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:39 +0200 Subject: bpf: cpumap xdp_buff to skb conversion and allocation This patch makes cpumap functional, by adding SKB allocation and invoking the network stack on the dequeuing CPU. For constructing the SKB on the remote CPU, the xdp_buff in converted into a struct xdp_pkt, and it mapped into the top headroom of the packet, to avoid allocating separate mem. For now, struct xdp_pkt is just a cpumap internal data structure, with info carried between enqueue to dequeue. If a driver doesn't have enough headroom it is simply dropped, with return code -EOVERFLOW. This will be picked up the xdp tracepoint infrastructure, to allow users to catch this. V2: take into account xdp->data_meta V4: - Drop busypoll tricks, keeping it more simple. - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei V5: correct RCU read protection around __netif_receive_skb_core. 
V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 152 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 768da6a2c265..ee7adf4352dd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -25,6 +25,9 @@ #include #include +#include /* netif_receive_skb_core */ +#include /* eth_type_trans */ + /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is * guaranteed that setting flush bit and flush operation happen on @@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } +/* For now, xdp_pkt is a cpumap internal data structure, with info + * carried between enqueue to dequeue. It is mapped into the top + * headroom of the packet, to avoid allocating separate mem. + */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; + u16 metasize; + struct net_device *dev_rx; +}; + +/* Convert xdp_buff to xdp_pkt */ +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) +{ + struct xdp_pkt *xdp_pkt; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if ((headroom - metasize) < sizeof(*xdp_pkt)) + return NULL; + + /* Store info in top of packet */ + xdp_pkt = xdp->data_hard_start; + + xdp_pkt->data = xdp->data; + xdp_pkt->len = xdp->data_end - xdp->data; + xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); + xdp_pkt->metasize = metasize; + + return xdp_pkt; +} + +struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) +{ + unsigned int frame_size; + void *pkt_data_start; + struct sk_buff *skb; + + /* build_skb need to place skb_shared_info after SKB end, and + * also want to know the memory "truesize". Thus, need to + * know the memory frame size backing xdp_buff. + * + * XDP was designed to have PAGE_SIZE frames, but this + * assumption is not longer true with ixgbe and i40e. It + * would be preferred to set frame_size to 2048 or 4096 + * depending on the driver. + * frame_size = 2048; + * frame_len = frame_size - sizeof(*xdp_pkt); + * + * Instead, with info avail, skb_shared_info in placed after + * packet len. This, unfortunately fakes the truesize. + * Another disadvantage of this approach, the skb_shared_info + * is not at a fixed memory location, with mixed length + * packets, which is bad for cache-line hotness. 
+ */ + frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + skb = build_skb(pkt_data_start, frame_size); + if (!skb) + return NULL; + + skb_reserve(skb, xdp_pkt->headroom); + __skb_put(skb, xdp_pkt->len); + if (xdp_pkt->metasize) + skb_metadata_set(skb, xdp_pkt->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + return skb; +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + unsigned int processed = 0, drops = 0; struct xdp_pkt *xdp_pkt; - schedule(); - /* Do work */ - while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { - /* For now just "refcnt-free" */ - page_frag_free(xdp_pkt); + /* Release CPU reschedule checks */ + if (__ptr_ring_empty(rcpu->queue)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else { + cond_resched(); + } + __set_current_state(TASK_RUNNING); + + /* Process packets in rcpu->queue */ + local_bh_disable(); + /* + * The bpf_cpu_map_entry is single consumer, with this + * kthread CPU pinned. Lockless access to ptr_ring + * consume side valid as no-resize allowed of queue. + */ + while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + struct sk_buff *skb; + int ret; + + skb = cpu_map_build_skb(rcpu, xdp_pkt); + if (!skb) { + page_frag_free(xdp_pkt); + continue; + } + + /* Inject into network stack */ + ret = netif_receive_skb_core(skb); + if (ret == NET_RX_DROP) + drops++; + + /* Limit BH-disable period */ + if (++processed == 8) + break; } - __set_current_state(TASK_INTERRUPTIBLE); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, return 0; } -/* Notice: Will change in later patch */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; -}; - /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ @@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { struct xdp_pkt *xdp_pkt; - int headroom; - /* For now this is just used as a void pointer to data_hard_start. - * Followup patch will generalize this. - */ - xdp_pkt = xdp->data_hard_start; + xdp_pkt = convert_to_xdp_pkt(xdp); + if (!xdp_pkt) + return -EOVERFLOW; - /* Fake writing into xdp_pkt->data to measure overhead */ - headroom = xdp->data - xdp->data_hard_start; - if (headroom < sizeof(*xdp_pkt)) - xdp_pkt->data = xdp->data; + /* Info needed when constructing SKB on remote CPU */ + xdp_pkt->dev_rx = dev_rx; bq_enqueue(rcpu, xdp_pkt); return 0; -- cgit v1.2.3 From f9419f7bd7a5318b636a941a0214c5cdfa6f6530 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:44 +0200 Subject: bpf: cpumap add tracepoints This adds two tracepoint to the cpumap. One for the enqueue side trace_xdp_cpumap_enqueue() and one for the kthread dequeue side trace_xdp_cpumap_kthread(). 
To mitigate the tracepoint overhead, these are invoked during the enqueue/dequeue bulking phases, thus amortizing the cost. The obvious use-cases are for debugging and monitoring. The non-intuitive use-case is using these as a feedback loop to know the system load. One can imagine auto-scaling by reducing, adding or activating more worker CPUs on demand. V4: tracepoint remove time_limit info, instead add sched info V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ee7adf4352dd..b4358d84ddf1 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* netif_receive_skb_core */ #include /* eth_type_trans */ @@ -43,6 +44,8 @@ struct xdp_bulk_queue { /* Struct for every remote "destination" CPU in map */ struct bpf_cpu_map_entry { + u32 cpu; /* kthread CPU and map index */ + int map_id; /* Back reference to map */ u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ @@ -280,15 +283,16 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0; + unsigned int processed = 0, drops = 0, sched = 0; struct xdp_pkt *xdp_pkt; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); + sched = 1; } else { - cond_resched(); + sched = cond_resched(); } __set_current_state(TASK_RUNNING); @@ -318,6 +322,9 @@ static int cpu_map_kthread_run(void *data) if (++processed == 8) break; } + /* Feedback loop via tracepoint */ + trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -354,7 +361,9 @@ struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) if (err) goto free_queue; - rcpu->qsize = qsize; + rcpu->cpu = cpu; + rcpu->map_id = map_id; + rcpu->qsize = qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -584,6 +593,8 @@ const struct bpf_map_ops cpu_map_ops = { static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, struct xdp_bulk_queue *bq) { + unsigned int processed = 0, drops = 0; + const int to_cpu = rcpu->cpu; struct ptr_ring *q; int i; @@ -599,13 +610,16 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdp_pkt); if (err) { - /* Free xdp_pkt */ - page_frag_free(xdp_pkt); + drops++; + page_frag_free(xdp_pkt); /* Free xdp_pkt */ } + processed++; } bq->count = 0; spin_unlock(&q->producer_lock); + /* Feedback loop via tracepoints */ + trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; } -- cgit v1.2.3 From 5ffeb0501c6b36d080de78372fdb70b404b91e9d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 25 Jul 2016 16:07:10 +0100 Subject: genirq: export irq_get_percpu_devid_partition to modules Any modular driver using cluster-affine PPIs needs to be able to call irq_get_percpu_devid_partition so that it can enable the IRQ on the correct subset of CPUs. This patch exports the symbol so that it can be called from within a module. 
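For illustration only, a modular driver owning a cluster-affine PPI could combine the newly exported helper with enable_percpu_irq() roughly as sketched below; the example_* names and the cross-call enable path are assumptions for this sketch, not taken from the patch:

#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/smp.h>

/* Ask the irqdomain which CPUs back this percpu-devid IRQ, then enable
 * it on exactly those CPUs. enable_percpu_irq() must run on the target
 * CPU itself, hence the cross-call.
 */
static void example_enable_on_cpu(void *info)
{
	unsigned int irq = *(unsigned int *)info;

	enable_percpu_irq(irq, IRQ_TYPE_NONE);
}

static int example_enable_cluster_ppi(unsigned int irq)
{
	cpumask_t affinity;
	int ret;

	ret = irq_get_percpu_devid_partition(irq, &affinity);
	if (ret)
		return ret;

	on_each_cpu_mask(&affinity, example_enable_on_cpu, &irq, true);
	return 0;
}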
Acked-by: Marc Zyngier Acked-by: Thomas Gleixner Signed-off-by: Will Deacon --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 82afb7ed369f..694c1a9d6485 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -863,6 +863,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) return 0; } +EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); void kstat_incr_irq_this_cpu(unsigned int irq) { -- cgit v1.2.3 From bc1d202023eb66f088f736ba423bee1cf135c720 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 16 Aug 2016 16:53:15 +0100 Subject: perf/core: Export AUX buffer helpers to modules Perf PMU drivers using AUX buffers cannot be built as modules unless the AUX helpers are exported. This patch exports perf_aux_output_{begin,end,skip} and perf_get_aux to modules. Cc: Peter Zijlstra Signed-off-by: Will Deacon --- kernel/events/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..6d9bffe4d6cc 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -411,6 +411,7 @@ err: return NULL; } +EXPORT_SYMBOL_GPL(perf_aux_output_begin); static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) { @@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb_free_aux(rb); ring_buffer_put(rb); } +EXPORT_SYMBOL_GPL(perf_aux_output_end); /* * Skip over a given number of bytes in the AUX buffer, due to, for example, @@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) return 0; } +EXPORT_SYMBOL_GPL(perf_aux_output_skip); void *perf_get_aux(struct perf_output_handle *handle) { @@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle) return handle->rb->aux_priv; } +EXPORT_SYMBOL_GPL(perf_get_aux); #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) -- cgit v1.2.3 From 28e33f9d78eefe98ea86673ab31e988b37a9a738 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 11:16:55 -0700 Subject: bpf: disallow arithmetic operations on context pointer Commit f1174f77b50c ("bpf/verifier: rework value tracking") removed the crafty selection of which pointer types are allowed to be modified. This is OK for most pointer types since adjust_ptr_min_max_vals() will catch operations on immutable pointers. One exception is PTR_TO_CTX which is now allowed to be offseted freely. The intent of aforementioned commit was to allow context access via modified registers. The offset passed to ->is_valid_access() verifier callback has been adjusted by the value of the variable offset. What is missing, however, is taking the variable offset into account when the context register is used. Or in terms of the code adding the offset to the value passed to the ->convert_ctx_access() callback. This leads to the following eBPF user code: r1 += 68 r0 = *(u32 *)(r1 + 8) exit being translated to this in kernel space: 0: (07) r1 += 68 1: (61) r0 = *(u32 *)(r1 +180) 2: (95) exit Offset 8 is corresponding to 180 in the kernel, but offset 76 is valid too. Verifier will "accept" access to offset 68+8=76 but then "convert" access to offset 8 as 180. Effective access to offset 248 is beyond the kernel context. (This is a __sk_buff example on a debug-heavy kernel - packet mark is 8 -> 180, 76 would be data.) 
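In restricted C the same access pattern could look roughly like the hypothetical sketch below (loader boilerplate omitted, and the compiler may fold the constant arithmetic differently; it is only meant to mirror the instruction sequence above):

#include <linux/types.h>
#include <linux/bpf.h>

/* Hypothetical equivalent of "r1 += 68; r0 = *(u32 *)(r1 + 8)": the
 * context pointer is moved by a constant before the load, so the offset
 * checked by ->is_valid_access() and the one rewritten by
 * ->convert_ctx_access() no longer agree. With this fix the verifier
 * rejects it with "dereference of modified ctx ptr".
 */
int ctx_offset_example(struct __sk_buff *skb)
{
	unsigned char *moved = (unsigned char *)skb + 68;

	return *(volatile __u32 *)(moved + 8);
}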
Dereferencing the modified context pointer is not as easy as dereferencing other types, because we have to translate the access to reading a field in kernel structures which is usually at a different offset and often of a different size. To allow modifying the pointer we would have to make sure that given eBPF instruction will always access the same field or the fields accessed are "compatible" in terms of offset and size... Disallow dereferencing modified context pointers and add to selftests the test case described here. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Edward Cree Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8b8d6ba39e23..20f3889c006e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1116,7 +1116,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* ctx accesses must be at a fixed offset, so that we can * determine what type of data were returned. */ - if (!tnum_is_const(reg->var_off)) { + if (reg->off) { + verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", + regno, reg->off, off - reg->off); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -1124,7 +1129,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn tn_buf, off, size); return -EACCES; } - off += reg->var_off.value; err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a -- cgit v1.2.3 From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:53 -0700 Subject: bpf: split verifier and program ops struct bpf_verifier_ops contains both verifier ops and operations used later during program's lifetime (test_run). Split the runtime ops into a different structure. BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops to the names. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 16 +++++++++++++--- kernel/bpf/verifier.c | 12 ++++++------ kernel/trace/bpf_trace.c | 15 ++++++++++++--- 3 files changed, 31 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 54fba06942f5..444902b5a30d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -739,9 +739,18 @@ err_put: return err; } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ - [_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _prog_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include #undef BPF_PROG_TYPE @@ -754,6 +763,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; + prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4d5136725a2..38e24d69fc95 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -856,8 +856,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + } else if (env->prog->aux->vops->is_valid_access && + env->prog->aux->vops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1565,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->ops->get_func_proto) - fn = env->prog->aux->ops->get_func_proto(func_id); + if (env->prog->aux->vops->get_func_proto) + fn = env->prog->aux->vops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4035,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const struct bpf_verifier_ops *ops = env->prog->aux->vops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4236,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->ops->get_func_proto(insn->imm); + fn = prog->aux->vops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 04ea5314f2bc..3126da2f468a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -561,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; +const struct 
bpf_prog_ops kprobe_prog_ops = { +}; + BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -667,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -727,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; + +const struct bpf_prog_ops perf_event_prog_ops = { +}; -- cgit v1.2.3 From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:54 -0700 Subject: bpf: remove the verifier ops from program structure Since the verifier ops don't have to be associated with the program for its entire lifetime we can move it to verifier's struct bpf_verifier_env. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 10 ---------- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 444902b5a30d..0e893cac6795 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -748,22 +748,12 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #undef BPF_MAP_TYPE }; -static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ - [_id] = & _name ## _verifier_ops, -#define BPF_MAP_TYPE(_id, _ops) -#include -#undef BPF_PROG_TYPE -#undef BPF_MAP_TYPE -}; - static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; - prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38e24d69fc95..3b6e2c550e96 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,15 @@ #include "disasm.h" +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. 
@@ -856,8 +865,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->vops->is_valid_access && - env->prog->aux->vops->is_valid_access(off, size, t, &info)) { + } else if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1574,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->vops->get_func_proto) - fn = env->prog->aux->vops->get_func_proto(func_id); + if (env->ops->get_func_proto) + fn = env->ops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4044,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->vops; + const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4245,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->vops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -4294,6 +4303,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; + env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -4406,6 +4416,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; + env->ops = bpf_verifier_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; -- cgit v1.2.3 From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:55 -0700 Subject: bpf: move knowledge about post-translation offsets out of verifier Use the fact that verifier ops are now separate from program ops to define a separate set of callbacks for verification of already translated programs. Since we expect the analyzer ops to be defined only for a small subset of all program types initialize their array by hand (don't use linux/bpf_types.h). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 55 +++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b6e2c550e96..545b8c45a578 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -822,36 +822,6 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, - struct bpf_insn_access_aux *info) -{ - switch (env->prog->type) { - case BPF_PROG_TYPE_XDP: - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - case BPF_PROG_TYPE_SCHED_CLS: - switch (off) { - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - default: - return false; - } -} - /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -860,13 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - if (env->analyzer_ops) { - if (analyzer_is_valid_access(env, off, &info)) { - *reg_type = info.reg_type; - return 0; - } - } else if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -874,9 +839,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; + if (env->analyzer_ops) + return 0; + + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -4400,12 +4368,21 @@ err_free_env: return ret; } +static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { + [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, + [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +}; + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv) { struct bpf_verifier_env *env; int ret; + if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || + !bpf_analyzer_ops[prog->type]) + return -EOPNOTSUPP; + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -4416,7 +4393,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; - env->ops = bpf_verifier_ops[env->prog->type]; + env->ops = bpf_analyzer_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; -- cgit v1.2.3 From 4f3a871443669c6b4d458a60ac8d8ca5eedc3f97 Mon Sep 17 00:00:00 2001 From: "Naveen N. 
Rao" Date: Tue, 17 Oct 2017 13:48:34 +0530 Subject: Revert "kprobes: Warn if optprobe handler tries to change execution path" This reverts commit: e863d539614641 ("kprobes: Warn if optprobe handler tries to change execution path") On PowerPC, we place a probe at kretprobe_trampoline to catch function returns and with CONFIG_OPTPROBES=y, this probe gets optimized. This works for us due to the way we handle the optprobe as described in commit: 762df10bad6954 ("powerpc/kprobes: Optimize kprobe in kretprobe_trampoline()") With the above commit, we end up with a warning. As such, revert this change. Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171017081834.3629-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d28377a0e32..15fba7fe57c8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -387,10 +387,7 @@ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) list_for_each_entry_rcu(kp, &p->list, list) { if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); - if (kp->pre_handler(kp, regs)) { - if (WARN_ON_ONCE(1)) - pr_err("Optprobe ignores instruction pointer changing.(%pF)\n", p->addr); - } + kp->pre_handler(kp, regs); } reset_kprobe_instance(); } -- cgit v1.2.3 From c310ce4dcb9df9b2f1be82caff7dae609fe53f72 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sun, 8 Oct 2017 20:55:59 -0700 Subject: timers: Avoid an unnecessary iteration in __run_timers() If the base clock is behind jiffies in the soft irq expiry code then the next timer is retrieved by get_next_timer_interrupt() to avoid incrementing base clock one by one. If the next timer interrupt is past current jiffies then the base clock is set to jiffies - 1. At the call site this is incremented and another iteration through the expiry loop is executed which checks empty hash buckets. That's a pointless excercise because it's already known that the next timer is past jiffies. Set the base clock in that case to jiffies directly so it gets incremented to jiffies + 1 at the call site resulting in immediate termination of the expiry loop. [ tglx: Massaged changelog and added comment to the code ] Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Acked-by: Anna-Maria Gleixner Cc: Joe Jin Cc: sboyd@codeaurora.org Cc: Srinivas Reddy Eeda Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/7086a857-f90c-4616-bbe8-f7696f21626c@default --- kernel/time/timer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 38613ced2324..ee1a88d8afb2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1560,8 +1560,11 @@ static int collect_expired_timers(struct timer_base *base, * jiffies, otherwise forward to the next expiry time: */ if (time_after(next, jiffies)) { - /* The call site will increment clock! */ - base->clk = jiffies - 1; + /* + * The call site will increment base->clk and then + * terminate the expiry loop immediately. 
+ */ + base->clk = jiffies; return 0; } base->clk = next; -- cgit v1.2.3 From 2b5175c4fa974b6aa05bbd2ee8d443a8036a1714 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Oct 2017 09:54:57 +0200 Subject: genirq: Add config option for reservation mode The interrupt reservation mode requires reactivation of PCI/MSI interrupts. Create a config option, so the PCI code can set the corresponding flag when required. Signed-off-by: Thomas Gleixner Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Link: https://lkml.kernel.org/r/20171017075600.369375409@linutronix.de --- kernel/irq/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index ac1a3e29d3b9..89e355866450 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -100,6 +100,9 @@ config IRQ_TIMINGS config GENERIC_IRQ_MATRIX_ALLOCATOR bool +config GENERIC_IRQ_RESERVATION_MODE + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS -- cgit v1.2.3 From 32a6c7233c41216f5dd41fc7bf100eedb1063dfc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 15:58:25 -0700 Subject: workqueue: Convert timers to use timer_setup() (part 2) In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. (The prior workqueue patch missed a few timers.) Signed-off-by: Kees Cook Acked-by: Tejun Heo Cc: Lai Jiangshan Link: https://lkml.kernel.org/r/20171016225825.GA99101@beast Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c77fdf6bf24f..6e5eed58f215 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1831,9 +1831,9 @@ static void destroy_worker(struct worker *worker) wake_up_process(worker->task); } -static void idle_worker_timeout(unsigned long __pool) +static void idle_worker_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, idle_timer); spin_lock_irq(&pool->lock); @@ -1879,9 +1879,9 @@ static void send_mayday(struct work_struct *work) } } -static void pool_mayday_timeout(unsigned long __pool) +static void pool_mayday_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; spin_lock_irq(&pool->lock); @@ -3241,11 +3241,9 @@ static int init_worker_pool(struct worker_pool *pool) INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); - setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, - (unsigned long)pool); + timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); + timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); -- cgit v1.2.3 From ba16490eac146ebb178017e5de3d61c645552fab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Oct 2017 16:10:19 +0200 Subject: timer: Convert stub timer to timer_setup() In preparation for unconditionally passing 
the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Signed-off-by: Thomas Gleixner Cc: Kees Cook --- kernel/time/timer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ee1a88d8afb2..fbb1f85327bf 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -610,7 +610,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) } /* Stub timer callback for improperly used timers. */ -static void stub_timer(unsigned long data) +static void stub_timer(struct timer_list *unused) { WARN_ON(1); } @@ -626,7 +626,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - setup_timer(timer, stub_timer, 0); + timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: @@ -665,7 +665,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - setup_timer(timer, stub_timer, 0); + timer_setup(timer, stub_timer, 0); return true; default: return false; -- cgit v1.2.3 From 5cdda5117e125e0dbb020425cc55a4c143c6febc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Oct 2017 17:24:28 +0200 Subject: locking/static_keys: Improve uninitialized key warning Right now it says: static_key_disable_cpuslocked used before call to jump_label_init ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:161 static_key_disable_cpuslocked+0x68/0x70 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.14.0-rc5+ #1 Hardware name: SGI.COM C2112-4GP3/X10DRT-P-Series, BIOS 2.0a 05/09/2016 task: ffffffff81c0e480 task.stack: ffffffff81c00000 RIP: 0010:static_key_disable_cpuslocked+0x68/0x70 RSP: 0000:ffffffff81c03ef0 EFLAGS: 00010096 ORIG_RAX: 0000000000000000 RAX: 0000000000000041 RBX: ffffffff81c32680 RCX: ffffffff81c5cbf8 RDX: 0000000000000001 RSI: 0000000000000092 RDI: 0000000000000002 RBP: ffff88807fffd240 R08: 726f666562206465 R09: 0000000000000136 R10: 0000000000000000 R11: 696e695f6c656261 R12: ffffffff82158900 R13: ffffffff8215f760 R14: 0000000000000001 R15: 0000000000000008 FS: 0000000000000000(0000) GS:ffff883f7f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff88807ffff000 CR3: 0000000001c09000 CR4: 00000000000606b0 Call Trace: static_key_disable+0x16/0x20 start_kernel+0x15a/0x45d ? load_ucode_intel_bsp+0x11/0x2d secondary_startup_64+0xa5/0xb0 Code: 48 c7 c7 a0 15 cf 81 e9 47 53 4b 00 48 89 df e8 5f fc ff ff eb e8 48 c7 c6 \ c0 97 83 81 48 c7 c7 d0 ff a2 81 31 c0 e8 c5 9d f5 ff <0f> ff eb a7 0f ff eb \ b0 e8 eb a2 4b 00 53 48 89 fb e8 42 0e f0 but it doesn't tell me which key it is. So dump the key's name too: static_key_disable_cpuslocked(): static key 'virt_spin_lock_key' used before call to jump_label_init() And that makes pinpointing which key is causing that a lot easier. 
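To see the improvement in practice, a made-up early-boot user such as the following would now name the offending key in the warning instead of only the call site (the key and function are placeholders; the quoted message is approximate):

#include <linux/init.h>
#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(example_early_key);

/* If this ran before jump_label_init(), the warning would now read
 * along the lines of: "static_key_enable_cpuslocked(): static key
 * 'example_early_key' used before call to jump_label_init()".
 */
static void __init example_early_setup(void)
{
	static_branch_enable(&example_early_key);
}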
include/linux/jump_label.h | 14 +++++++------- include/linux/jump_label_ratelimit.h | 6 +++--- kernel/jump_label.c | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) Signed-off-by: Borislav Petkov Reviewed-by: Steven Rostedt (VMware) Cc: Boris Ostrovsky Cc: Hannes Frederic Sowa Cc: Jason Baron Cc: Juergen Gross Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171018152428.ffjgak4o25f7ept6@pd.tnic Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0bf2e8f5244a..8ff4ca4665ff 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); /* * Careful if we get concurrent static_key_slow_inc() calls; @@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); @@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work) void static_key_slow_dec(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key, 0, NULL); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_deferred(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(&key->key, key->timeout, &key->work); } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); void static_key_deferred_flush(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); flush_delayed_work(&key->work); } EXPORT_SYMBOL_GPL(static_key_deferred_flush); @@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } -- cgit v1.2.3 From 93862e385ded7c60351e09fcd2a541d273650905 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 13 Oct 2017 15:08:41 -0400 Subject: livepatch: add (un)patch callbacks Provide livepatch modules a klp_object (un)patching notification mechanism. Pre and post-(un)patch callbacks allow livepatch modules to setup or synchronize changes that would be difficult to support in only patched-or-unpatched code contexts. Callbacks can be registered for target module or vmlinux klp_objects, but each implementation is klp_object specific. - Pre-(un)patch callbacks run before any (un)patching transition starts. - Post-(un)patch callbacks run once an object has been (un)patched and the klp_patch fully transitioned to its target state. Example use cases include modification of global data and registration of newly available services/handlers. See Documentation/livepatch/callbacks.txt for details and samples/livepatch/ for examples. 
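A minimal sketch of how a livepatch module might wire up the new callbacks follows; the callback bodies are placeholders and the actual function replacements in .funcs are elided, so this is not a working patch module:

#include <linux/livepatch.h>
#include <linux/module.h>

/* Placeholder callbacks: pre_patch may veto by returning non-zero, and
 * post_unpatch only runs if the earlier pre_patch succeeded.
 */
static int example_pre_patch(struct klp_object *obj)
{
	pr_info("preparing state for %s\n", obj->name ? obj->name : "vmlinux");
	return 0;
}

static void example_post_unpatch(struct klp_object *obj)
{
	pr_info("cleaning up state for %s\n", obj->name ? obj->name : "vmlinux");
}

static struct klp_func funcs[] = {
	/* the actual function replacements are elided in this sketch */
	{ }
};

static struct klp_object objs[] = {
	{
		.name = NULL,			/* vmlinux */
		.funcs = funcs,
		.callbacks = {
			.pre_patch = example_pre_patch,
			.post_unpatch = example_post_unpatch,
		},
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};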
Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 51 ++++++++++++++++++++++++++++++++++--------- kernel/livepatch/core.h | 38 ++++++++++++++++++++++++++++++++ kernel/livepatch/patch.c | 1 + kernel/livepatch/transition.c | 21 +++++++++++++++--- 4 files changed, 98 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index b9628e43c78f..cafb5a84417d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj) return obj->name; } -static bool klp_is_object_loaded(struct klp_object *obj) -{ - return !obj->name || obj->mod; -} - /* sets obj->mod if object is not vmlinux and module is found */ static void klp_find_object_module(struct klp_object *obj) { @@ -285,6 +280,8 @@ static int klp_write_object_relocations(struct module *pmod, static int __klp_disable_patch(struct klp_patch *patch) { + struct klp_object *obj; + if (klp_transition_patch) return -EBUSY; @@ -295,6 +292,10 @@ static int __klp_disable_patch(struct klp_patch *patch) klp_init_transition(patch, KLP_UNPATCHED); + klp_for_each_object(patch, obj) + if (patch->enabled && obj->patched) + klp_pre_unpatch_callback(obj); + /* * Enforce the order of the func->transition writes in * klp_init_transition() and the TIF_PATCH_PENDING writes in @@ -388,13 +389,18 @@ static int __klp_enable_patch(struct klp_patch *patch) if (!klp_is_object_loaded(obj)) continue; - ret = klp_patch_object(obj); + ret = klp_pre_patch_callback(obj); if (ret) { - pr_warn("failed to enable patch '%s'\n", - patch->mod->name); + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } - klp_cancel_transition(); - return ret; + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; } } @@ -403,6 +409,11 @@ static int __klp_enable_patch(struct klp_patch *patch) patch->enabled = true; return 0; +err: + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); + return ret; } /** @@ -871,13 +882,27 @@ int klp_module_coming(struct module *mod) pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + obj->name); + goto err; + } + ret = klp_patch_object(obj); if (ret) { pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); + + if (patch != klp_transition_patch) + klp_post_unpatch_callback(obj); + goto err; } + if (patch != klp_transition_patch) + klp_post_patch_callback(obj); + break; } } @@ -927,9 +952,15 @@ void klp_module_going(struct module *mod) * is in transition. 
*/ if (patch->enabled || patch == klp_transition_patch) { + + if (patch != klp_transition_patch) + klp_pre_unpatch_callback(obj); + pr_notice("reverting patch '%s' on unloading module '%s'\n", patch->mod->name, obj->mod->name); klp_unpatch_object(obj); + + klp_post_unpatch_callback(obj); } klp_free_object_loaded(obj); diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index c74f24c47837..6fc907b54e71 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -1,6 +1,44 @@ #ifndef _LIVEPATCH_CORE_H #define _LIVEPATCH_CORE_H +#include + extern struct mutex klp_mutex; +static inline bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +static inline int klp_pre_patch_callback(struct klp_object *obj) +{ + int ret; + + ret = (obj->callbacks.pre_patch) ? + (*obj->callbacks.pre_patch)(obj) : 0; + + obj->callbacks.post_unpatch_enabled = !ret; + + return ret; +} + +static inline void klp_post_patch_callback(struct klp_object *obj) +{ + if (obj->callbacks.post_patch) + (*obj->callbacks.post_patch)(obj); +} + +static inline void klp_pre_unpatch_callback(struct klp_object *obj) +{ + if (obj->callbacks.pre_unpatch) + (*obj->callbacks.pre_unpatch)(obj); +} + +static inline void klp_post_unpatch_callback(struct klp_object *obj) +{ + if (obj->callbacks.post_unpatch_enabled && + obj->callbacks.post_unpatch) + (*obj->callbacks.post_unpatch)(obj); +} + #endif /* _LIVEPATCH_CORE_H */ diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 52c4e907c14b..82d584225dc6 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -28,6 +28,7 @@ #include #include #include +#include "core.h" #include "patch.h" #include "transition.h" diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index b004a1fb6032..7bf55b7f3687 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -109,9 +109,6 @@ static void klp_complete_transition(void) } } - if (klp_target_state == KLP_UNPATCHED && !immediate_func) - module_put(klp_transition_patch->mod); - /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) klp_synchronize_transition(); @@ -130,6 +127,24 @@ static void klp_complete_transition(void) } done: + klp_for_each_object(klp_transition_patch, obj) { + if (!klp_is_object_loaded(obj)) + continue; + if (klp_target_state == KLP_PATCHED) + klp_post_patch_callback(obj); + else if (klp_target_state == KLP_UNPATCHED) + klp_post_unpatch_callback(obj); + } + + /* + * See complementary comment in __klp_enable_patch() for why we + * keep the module reference for immediate patches. + */ + if (!klp_transition_patch->immediate && !immediate_func && + klp_target_state == KLP_UNPATCHED) { + module_put(klp_transition_patch->mod); + } + klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; } -- cgit v1.2.3 From 6116c3033a761611b1da980ea664c6ddff3eaed6 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 13 Oct 2017 15:08:42 -0400 Subject: livepatch: move transition "complete" notice into klp_complete_transition() klp_complete_transition() performs a bit of housework before a transition to KLP_PATCHED or KLP_UNPATCHED is actually completed (including post-(un)patch callbacks). To be consistent, move the transition "complete" kernel log notice out of klp_try_complete_transition() and into klp_complete_transition(). 
Suggested-by: Josh Poimboeuf Acked-by: Josh Poimboeuf Signed-off-by: Joe Lawrence Acked-by: Miroslav Benes Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 7bf55b7f3687..53887f0bca10 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -136,6 +136,9 @@ done: klp_post_unpatch_callback(obj); } + pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + /* * See complementary comment in __klp_enable_patch() for why we * keep the module reference for immediate patches. @@ -423,9 +426,6 @@ void klp_try_complete_transition(void) } success: - pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* we're done, now cleanup the data structures */ klp_complete_transition(); } -- cgit v1.2.3 From af026796054fb70439e919a925615e61b500ef6b Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 13 Oct 2017 15:08:43 -0400 Subject: livepatch: add transition notices Log a few kernel debug messages at the beginning of the following livepatch transition functions: klp_complete_transition() klp_cancel_transition() klp_init_transition() klp_reverse_transition() Also update the log notice message in klp_start_transition() for similar verbiage as the above messages. Suggested-by: Josh Poimboeuf Signed-off-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 53887f0bca10..56add6327736 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -82,6 +82,10 @@ static void klp_complete_transition(void) unsigned int cpu; bool immediate_func = false; + pr_debug("'%s': completing %s transition\n", + klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + if (klp_target_state == KLP_UNPATCHED) { /* * All tasks have transitioned to KLP_UNPATCHED so we can now @@ -163,6 +167,9 @@ void klp_cancel_transition(void) if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) return; + pr_debug("'%s': canceling patching transition, going to unpatch\n", + klp_transition_patch->mod->name); + klp_target_state = KLP_UNPATCHED; klp_complete_transition(); } @@ -441,7 +448,8 @@ void klp_start_transition(void) WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); - pr_notice("'%s': %s...\n", klp_transition_patch->mod->name, + pr_notice("'%s': starting %s transition\n", + klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* @@ -497,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state) */ klp_target_state = state; + pr_debug("'%s': initializing %s transition\n", patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + /* * If the patch can be applied or reverted immediately, skip the * per-task transitions. @@ -562,6 +573,11 @@ void klp_reverse_transition(void) unsigned int cpu; struct task_struct *g, *task; + pr_debug("'%s': reversing transition from %s\n", + klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? 
"patching to unpatching" : + "unpatching to patching"); + klp_transition_patch->enabled = !klp_transition_patch->enabled; klp_target_state = !klp_target_state; -- cgit v1.2.3 From f1d783585486c7c612f277c2a6f0c9bb5a67e463 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Oct 2017 10:44:54 +0900 Subject: irqdomain: Move revmap_trees_mutex to struct irq_domain The revmap_trees_mutex protects domain->revmap_tree. There is no need to make it global because it is allowed to modify revmap_tree of two different domains concurrently. Having said that, this would not be a actual bottleneck because the interrupt map/unmap does not occur quite often. Rather, the motivation is to tidy up the code from a data structure point of view. Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b50f737574ae..31d0f9ff4f00 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -21,7 +21,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; static void irq_domain_check_hierarchy(struct irq_domain *domain); @@ -211,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); + mutex_init(&domain->revmap_tree_mutex); domain->ops = ops; domain->host_data = host_data; domain->hwirq_max = hwirq_max; @@ -462,9 +462,9 @@ static void irq_domain_clear_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -475,9 +475,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = irq_data->irq; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -1459,11 +1459,11 @@ static void irq_domain_fix_revmap(struct irq_data *d) return; /* Not using radix tree. */ /* Fix up the revmap. */ - mutex_lock(&revmap_trees_mutex); + mutex_lock(&d->domain->revmap_tree_mutex); slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&d->domain->revmap_tree_mutex); } /** -- cgit v1.2.3 From d03cc2d8aed3bfbdfc1f52e79054e0013b5721ca Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Sep 2017 21:20:41 +0900 Subject: irqdomain: Add __rcu annotations to radix tree slot Fix different address spaces warning of sparse. 
kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces) kernel/irq/irqdomain.c:1463:14: expected void **slot kernel/irq/irqdomain.c:1463:14: got void [noderef] ** kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces) kernel/irq/irqdomain.c:1465:66: expected void [noderef] **slot kernel/irq/irqdomain.c:1465:66: got void **slot Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 31d0f9ff4f00..fbbf34293b17 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. */ -- cgit v1.2.3 From 9ad0457423af877ad1b76c105a57130da028ccad Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 6 Oct 2017 16:27:26 +0200 Subject: kernel/module: Delete an error message for a failed memory allocation in add_module_usage() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Jessica Yu --- kernel/module.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..07ef44767245 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -837,10 +837,8 @@ static int add_module_usage(struct module *a, struct module *b) pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - pr_warn("%s: out of memory loading\n", a->name); + if (!use) return -ENOMEM; - } use->source = a; use->target = b; -- cgit v1.2.3 From 82f8dd28bd3abe181b7a66ea4ea132134d37a400 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 17 Oct 2017 16:55:53 +0200 Subject: bpf: fix splat for illegal devmap percpu allocation It was reported that syzkaller was able to trigger a splat on devmap percpu allocation due to illegal/unsupported allocation request size passed to __alloc_percpu(): [ 70.094249] illegal size (32776) or align (8) for percpu allocation [ 70.094256] ------------[ cut here ]------------ [ 70.094259] WARNING: CPU: 3 PID: 3451 at mm/percpu.c:1365 pcpu_alloc+0x96/0x630 [...] [ 70.094325] Call Trace: [ 70.094328] __alloc_percpu_gfp+0x12/0x20 [ 70.094330] dev_map_alloc+0x134/0x1e0 [ 70.094331] SyS_bpf+0x9bc/0x1610 [ 70.094333] ? selinux_task_setrlimit+0x5a/0x60 [ 70.094334] ? security_task_setrlimit+0x43/0x60 [ 70.094336] entry_SYSCALL_64_fastpath+0x1a/0xa5 This was due to too large max_entries for the map such that we surpassed the upper limit of PCPU_MIN_UNIT_SIZE. It's fine to fail naturally here, so switch to __alloc_percpu_gfp() and pass __GFP_NOWARN instead. 
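The resulting pattern, sketched here outside the devmap code with a stand-in for dev_map_bitmap_size(), is simply a per-cpu allocation that is allowed to fail quietly:

#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* Stand-in size calculation: one bit per possible map entry, rounded up
 * to unsigned longs. An oversized, user-controlled max_entries now
 * yields NULL instead of tripping the percpu allocator warning.
 */
static void __percpu *example_alloc_flush_bitmap(u32 max_entries)
{
	size_t size = BITS_TO_LONGS(max_entries) * sizeof(unsigned long);

	return __alloc_percpu_gfp(size, __alignof__(unsigned long),
				  GFP_KERNEL | __GFP_NOWARN);
}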
Fixes: 11393cc9b9be ("xdp: Add batching support to redirect map") Reported-by: Mark Rutland Reported-by: Shankara Pailoor Reported-by: Richard Weinberger Signed-off-by: Daniel Borkmann Cc: John Fastabend Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e093d9a2c4dd..920428d84da2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -111,8 +111,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), - __alignof__(unsigned long)); + dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), + __alignof__(unsigned long), + GFP_KERNEL | __GFP_NOWARN); if (!dtab->flush_needed) goto free_dtab; -- cgit v1.2.3 From bc6d5031b43a2291de638ab9304320b4cae61689 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 17 Oct 2017 16:55:54 +0200 Subject: bpf: do not test for PCPU_MIN_UNIT_SIZE before percpu allocations PCPU_MIN_UNIT_SIZE is an implementation detail of the percpu allocator. Given we support __GFP_NOWARN now, lets just let the allocation request fail naturally instead. The two call sites from BPF mistakenly assumed __GFP_NOWARN would work, so no changes needed to their actual __alloc_percpu_gfp() calls which use the flag already. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/hashtab.c | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..e2636737b69b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); if (array_size >= U32_MAX - PAGE_SIZE || - elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..6533f08d1238 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) - /* make sure the size for pcpu_alloc() is reasonable */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) -- cgit v1.2.3 From 3a29ddb1c5986a6d3f941bfb1f434105203ce7f6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 19 Oct 2017 15:17:23 +0100 Subject: clockevents: Retry programming min delta up to 10 times When CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=n, the call path hrtimer_reprogram -> clockevents_program_event -> clockevents_program_min_delta will not retry if the clock event driver returns -ETIME. If the driver could not satisfy the program_min_delta for any reason, the lack of a retry means the CPU may not receive a tick interrupt, potentially until the counter does a full period. This leads to rcu_sched timeout messages as the stalled CPU is detected by other CPUs, and other issues if the CPU is holding locks or other resources at the point at which it stalls. 
There have been a couple of observed mechanisms through which a clock event driver could not satisfy the requested min_delta and return -ETIME. With the MIPS GIC driver, the shared execution resource within MT cores means inconventient latency due to execution of instructions from other hardware threads in the core, within gic_next_event, can result in an event being set in the past. Additionally under virtualisation it is possible to get unexpected latency during a clockevent device's set_next_event() callback which can make it return -ETIME even for a delta based on min_delta_ns. It isn't appropriate to use MIN_ADJUST in the virtualisation case as occasional hypervisor induced high latency will cause min_delta_ns to quickly increase to the maximum. Instead, borrow the retry pattern from the MIN_ADJUST case, but without making adjustments. Retry up to 10 times, each time increasing the attempted delta by min_delta, before giving up. [ Matt: Reworked the loop and made retry increase the delta. ] Signed-off-by: James Hogan Signed-off-by: Matt Redfearn Signed-off-by: Thomas Gleixner Cc: linux-mips@linux-mips.org Cc: Daniel Lezcano Cc: "Martin Schwidefsky" Cc: James Hogan Link: https://lkml.kernel.org/r/1508422643-6075-1-git-send-email-matt.redfearn@mips.com --- kernel/time/clockevents.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 4237e0744e26..16c027e9cc73 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -280,17 +280,22 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; - int64_t delta; + int64_t delta = 0; + int i; - delta = dev->min_delta_ns; - dev->next_event = ktime_add_ns(ktime_get(), delta); + for (i = 0; i < 10; i++) { + delta += dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); - if (clockevent_state_shutdown(dev)) - return 0; + if (clockevent_state_shutdown(dev)) + return 0; - dev->retries++; - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - return dev->set_next_event((unsigned long) clc, dev); + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + } + return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ -- cgit v1.2.3 From b88697810d7c1d102a529990f9071b0f14cfe6df Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 18 Oct 2017 08:33:44 -0700 Subject: rcu: Do not include rtmutex_common.h unconditionally This commit adjusts include files and provides definitions in preparation for suppressing lockdep false-positive ->boost_mtx complaints. Without this preparation, architectures not supporting rt_mutex will get build failures. Reported-by: kbuild test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fed95fa941e6..969eae45f05d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -54,6 +54,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); * This probably needs to be excluded from -rt builds. 
*/ #define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) +#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) #endif /* #else #ifdef CONFIG_RCU_BOOST */ @@ -911,8 +912,6 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "../locking/rtmutex_common.h" - static void rcu_wake_cond(struct task_struct *t, int status) { /* -- cgit v1.2.3 From 02a7c234e54052101164368ff981bd72f7acdd65 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Sep 2017 15:36:42 -0700 Subject: rcu: Suppress lockdep false-positive ->boost_mtx complaints RCU priority boosting uses rt_mutex_init_proxy_locked() to initialize an rt_mutex structure in locked state held by some other task. When that other task releases it, lockdep complains (quite accurately, but a bit uselessly) that the other task never acquired it. This complaint can suppress other, more helpful, lockdep complaints, and in any case it is a false positive. This commit therefore switches from rt_mutex_unlock() to rt_mutex_futex_unlock(), thereby avoiding the lockdep annotations. Of course, if lockdep ever learns about rt_mutex_init_proxy_locked(), additional adjustments will be required. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 969eae45f05d..1eaab96d1a3c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -531,7 +531,7 @@ void rcu_read_unlock_special(struct task_struct *t) /* Unboost if we were boosted. */ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_unlock(&rnp->boost_mtx); + rt_mutex_futex_unlock(&rnp->boost_mtx); /* * If this was the last task on the expedited lists, -- cgit v1.2.3 From c0da313e090d6e7130d0c3005245176296c24e4a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Sep 2017 09:58:47 -0700 Subject: rcu: Add extended-quiescent-state testing advice If you add or remove calls to rcu_idle_enter(), rcu_user_enter(), rcu_irq_exit(), rcu_irq_exit_irqson(), rcu_idle_exit(), rcu_user_exit(), rcu_irq_enter(), rcu_irq_enter_irqson(), rcu_nmi_enter(), or rcu_nmi_exit(), you should run a full set of tests on a kernel built with CONFIG_RCU_EQS_DEBUG=y. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0ad62b0e7b8..d234356d7afc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -837,6 +837,9 @@ static void rcu_eqs_enter(bool user) * We crowbar the ->dynticks_nesting field to zero to allow for * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. + * + * If you add or remove a call to rcu_idle_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_idle_enter(void) { @@ -852,6 +855,9 @@ void rcu_idle_enter(void) * is permitted between this call and rcu_user_exit(). This way the * CPU doesn't need to maintain the tick for RCU maintenance purposes * when the CPU runs in userspace. + * + * If you add or remove a call to rcu_user_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_user_enter(void) { @@ -875,6 +881,9 @@ void rcu_user_enter(void) * Use things like work queues to work around this limitation. * * You have been warned. + * + * If you add or remove a call to rcu_irq_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y.
*/ void rcu_irq_exit(void) { @@ -899,6 +908,9 @@ void rcu_irq_exit(void) /* * Wrapper for rcu_irq_exit() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_exit_irqson(void) { @@ -971,6 +983,9 @@ static void rcu_eqs_exit(bool user) * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. + * + * If you add or remove a call to rcu_idle_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_idle_exit(void) { @@ -987,6 +1002,9 @@ void rcu_idle_exit(void) * * Exit RCU idle mode while entering the kernel because it can * run a RCU read side critical section anytime. + * + * If you add or remove a call to rcu_user_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_user_exit(void) { @@ -1012,6 +1030,9 @@ void rcu_user_exit(void) * Use things like work queues to work around this limitation. * * You have been warned. + * + * If you add or remove a call to rcu_irq_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_enter(void) { @@ -1037,6 +1058,9 @@ void rcu_irq_enter(void) /* * Wrapper for rcu_irq_enter() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_enter_irqson(void) { @@ -1055,6 +1079,9 @@ void rcu_irq_enter_irqson(void) * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) + * + * If you add or remove a call to rcu_nmi_enter(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_nmi_enter(void) { @@ -1087,6 +1114,9 @@ void rcu_nmi_enter(void) * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_nmi_exit(void) { -- cgit v1.2.3 From 56628a7fc84a6e3c4c84886d70ec26da3d7f2ce4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 22 Sep 2017 17:28:06 +0200 Subject: rcu/segcblist: Include rcupdate.h The RT build on ARM complains about non-existing ULONG_CMP_LT. This commit therefore includes rcupdate.h into rcu_segcblist.c. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7649fcd2c4c7..88cba7c2956c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rcu_segcblist.h" -- cgit v1.2.3 From a961e40917fb14614d368d8bc9782ca4d6a8cd11 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Oct 2017 13:30:15 -0400 Subject: membarrier: Provide register expedited private command This introduces a "register private expedited" membarrier command which allows eventual removal of important memory barrier constraints on the scheduler fast-paths. It changes how the "private expedited" membarrier command (new to 4.14) is used from user-space. This new command allows processes to register their intent to use the private expedited command. 
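As a user-space illustration (not part of the patch itself), a minimal sketch of the intended call sequence under the new scheme: register intent once, then issue expedited barriers. It assumes a kernel with this change and a <linux/membarrier.h> that defines the new command:

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    /* Thin wrapper; membarrier has no glibc wrapper at this point. */
    static int membarrier(int cmd, int flags)
    {
            return syscall(__NR_membarrier, cmd, flags);
    }

    int main(void)
    {
            /* Register early, e.g. before spawning the second thread. */
            if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
                    perror("membarrier register");
                    return 1;
            }

            /* Without the registration above this would now fail with EPERM. */
            if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
                    perror("membarrier expedited");
                    return 1;
            }
            return 0;
    }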
This affects how the expedited private command introduced in 4.14-rc is meant to be used, and should be merged before 4.14 final. Processes are now required to register before using MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM. This fixes a problem that arose when designing requested extensions to sys_membarrier() to allow JITs to efficiently flush old code from instruction caches. Several potential algorithms are much less painful if the user registers intent to use this functionality early on, for example, before the process spawns the second thread. Registering at this time removes the need to interrupt each and every thread in that process at the first expedited sys_membarrier() system call. Signed-off-by: Mathieu Desnoyers Acked-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Viro Signed-off-by: Linus Torvalds --- kernel/sched/membarrier.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a92fddc22747..dd7908743dab 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "sched.h" /* for cpu_rq(). */ @@ -26,21 +27,26 @@ * except MEMBARRIER_CMD_QUERY. */ #define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ } -static void membarrier_private_expedited(void) +static int membarrier_private_expedited(void) { int cpu; bool fallback = false; cpumask_var_t tmpmask; + if (!(atomic_read(&current->mm->membarrier_state) + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) + return -EPERM; + if (num_online_cpus() == 1) - return; + return 0; /* * Matches memory barriers around rq->curr modification in @@ -94,6 +100,24 @@ static void membarrier_private_expedited(void) * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ + return 0; +} + +static void membarrier_register_private_expedited(void) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + + /* + * We need to consider threads belonging to different thread + * groups, which use the same mm. (CLONE_VM but not + * CLONE_THREAD). + */ + if (atomic_read(&mm->membarrier_state) + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) + return; + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + &mm->membarrier_state); } /** @@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) synchronize_sched(); return 0; case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - membarrier_private_expedited(); + return membarrier_private_expedited(); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: + membarrier_register_private_expedited(); return 0; default: return -EINVAL; -- cgit v1.2.3 From 27fdb35fe99011d86bcc54f62fe84712c53f4d05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Oct 2017 14:26:21 -0700 Subject: doc: Fix various RCU docbook comment-header problems Because many of RCU's files have not been included into docbook, a number of errors have accumulated. This commit fixes them. Signed-off-by: Paul E.
McKenney Signed-off-by: Linus Torvalds --- kernel/rcu/srcutree.c | 2 +- kernel/rcu/sync.c | 9 ++++++--- kernel/rcu/tree.c | 18 ++++++++++-------- 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 729a8706751d..6d5880089ff6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. + * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * * The callback function will be invoked some time after a full SRCU diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 50d1861f7759..3f943efcf61c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) } /** + * rcu_sync_enter_start - Force readers onto slow path for multiple updates + * @rsp: Pointer to rcu_sync structure to use for synchronization + * * Must be called after rcu_sync_init() and before first use. * * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() @@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) /** * rcu_sync_func() - Callback function managing reader access to fastpath - * @rsp: Pointer to rcu_sync structure to use for synchronization + * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization * * This function is passed to one of the call_rcu() functions by * rcu_sync_exit(), so that it is invoked after a grace period following the @@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp) * rcu_sync_exit(). Otherwise, set all state back to idle so that readers * can again use their fastpaths. */ -static void rcu_sync_func(struct rcu_head *rcu) +static void rcu_sync_func(struct rcu_head *rhp) { - struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; BUG_ON(rsp->gp_state != GP_PASSED); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0ad62b0e7b8..3e3650e94ae6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, * read-side critical sections have completed. call_rcu_sched() assumes * that the read-side critical sections end on enabling of preemption * or on voluntary preemption. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR - * - anything that disables preemption. + * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. * * These may be nested. * @@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * handler. This means that read-side critical sections in process * context must not be interrupted by softirqs. This interface is to be * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. - * OR - * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - * These may be nested. 
+ * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * + * These may be nested. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. -- cgit v1.2.3 From a30b85df7d599f626973e9cd3056fe755bd778e0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 20 Oct 2017 08:43:39 +0900 Subject: kprobes: Use synchronize_rcu_tasks() for optprobe with CONFIG_PREEMPT=y We want to wait for all potentially preempted kprobes trampoline execution to have completed. This guarantees that any freed trampoline memory is not in use by any task in the system anymore. synchronize_rcu_tasks() gives such a guarantee, so use it. Also, this guarantees to wait for all potentially preempted tasks on the instructions which will be replaced with a jump. Since this becomes a problem only when CONFIG_PREEMPT=y, enable CONFIG_TASKS_RCU=y for synchronize_rcu_tasks() in that case. Signed-off-by: Masami Hiramatsu Acked-by: Paul E. McKenney Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Naveen N . Rao Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150845661962.5443.17724352636247312231.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 15fba7fe57c8..a8fc1492b308 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work) do_unoptimize_kprobes(); /* - * Step 2: Wait for quiesence period to ensure all running interrupts - * are done. Because optprobe may modify multiple instructions - * there is a chance that Nth instruction is interrupted. In that - * case, running interrupt can return to 2nd-Nth byte of jump - * instruction. This wait is for avoiding it. + * Step 2: Wait for quiesence period to ensure all potentially + * preempted tasks to have normally scheduled. Because optprobe + * may modify multiple instructions, there is a chance that Nth + * instruction is preempted. In that case, such tasks can return + * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. + * Note that on non-preemptive kernel, this is transparently converted + * to synchronoze_sched() to wait for all interrupts to have completed. */ - synchronize_sched(); + synchronize_rcu_tasks(); /* Step 3: Optimize kprobes after quiesence period */ do_optimize_kprobes(); -- cgit v1.2.3 From 590c845930457d25d27dc1fdd964a1ce18ef2d7d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Oct 2017 08:14:37 +0900 Subject: kprobes: Disable the jprobes APIs Disable the jprobes APIs and comment out the jprobes API function code. This is in preparation of removing all jprobes related code (including kprobe's break_handler). Nowadays ftrace and other tracing features are mature enough to replace jprobes use-cases. Users can safely use ftrace and perf probe etc. for their use cases. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Ian McDonald Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E . 
McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vlad Yasevich Link: http://lkml.kernel.org/r/150724527741.5014.15465541485637899227.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a8fc1492b308..da2ccf142358 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1771,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } +#if 0 int register_jprobes(struct jprobe **jps, int num) { int ret = 0, i; @@ -1839,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num) } } EXPORT_SYMBOL_GPL(unregister_jprobes); +#endif #ifdef CONFIG_KRETPROBES /* -- cgit v1.2.3 From 2c7d662e2647aa55fa56dc449b3878ac24e17adf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Oct 2017 08:15:17 +0900 Subject: kprobes: Disable the jprobes test code Disable jprobes test code because jprobes are deprecated. This code will be completely removed when the jprobe code is removed. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Ian McDonald Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vlad Yasevich Link: http://lkml.kernel.org/r/150724531730.5014.6377596890962355763.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/test_kprobes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 47106a1e645a..dd53e354f630 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,7 +22,7 @@ #define div_factor 3 -static u32 rand1, preh_val, posth_val, jph_val; +static u32 rand1, preh_val, posth_val; static int errors, handler_errors, num_tests; static u32 (*target)(u32 value); static u32 (*target2)(u32 value); @@ -162,6 +162,9 @@ static int test_kprobes(void) } +#if 0 +static u32 jph_val; + static u32 j_kprobe_target(u32 value) { if (preemptible()) { @@ -239,6 +242,10 @@ static int test_jprobes(void) return 0; } +#else +#define test_jprobe() (0) +#define test_jprobes() (0) +#endif #ifdef CONFIG_KRETPROBES static u32 krph_val; -- cgit v1.2.3 From 435bf0d3f99a164df7e8c30428cef266b91d1d3b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Oct 2017 07:10:15 -0700 Subject: bpf: enforce TCP only support for sockmap Only TCP sockets have been tested and at the moment the state change callback only handles TCP sockets. This adds a check to ensure that sockets actually being added are TCP sockets. For net-next we can consider UDP support. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/sockmap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..c68899d5b246 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -840,6 +840,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP) { + fput(socket->file); + return -EOPNOTSUPP; + } + err = sock_map_ctx_update_elem(&skops, map, key, flags); fput(socket->file); return err; -- cgit v1.2.3 From 34f79502bbcfab659b8729da68b5e387f96eb4c1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Oct 2017 07:10:36 -0700 Subject: bpf: avoid preempt enable/disable in sockmap using tcp_skb_cb region SK_SKB BPF programs are run from the socket/tcp context but early in the stack before much of the TCP metadata is needed in tcp_skb_cb. So we can use some unused fields to place BPF metadata needed for SK_SKB programs when implementing the redirect function. This allows us to drop the preempt disable logic. It does however require an API change so sk_redirect_map() has been updated to additionally provide ctx_ptr to skb. Note, we do however continue to disable/enable preemption around actual BPF program running to account for map updates. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index c68899d5b246..beaabb21c3a3 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -39,6 +39,7 @@ #include #include #include +#include struct bpf_stab { struct bpf_map map; @@ -101,9 +102,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) return SK_DROP; skb_orphan(skb); + /* We need to ensure that BPF metadata for maps is also cleared + * when we orphan the skb so that we don't have the possibility + * to reference a stale map. + */ + TCP_SKB_CB(skb)->bpf.map = NULL; skb->sk = psock->sock; bpf_compute_data_end(skb); + preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); + preempt_enable(); skb->sk = NULL; return rc; @@ -114,17 +122,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) struct sock *sk; int rc; - /* Because we use per cpu values to feed input from sock redirect - * in BPF program to do_sk_redirect_map() call we need to ensure we - * are not preempted. RCU read lock is not sufficient in this case - * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. - */ - preempt_disable(); rc = smap_verdict_func(psock, skb); switch (rc) { case SK_REDIRECT: - sk = do_sk_redirect_map(); - preempt_enable(); + sk = do_sk_redirect_map(skb); if (likely(sk)) { struct smap_psock *peer = smap_psock_sk(sk); @@ -141,8 +142,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) /* Fall through and free skb otherwise */ case SK_DROP: default: - if (rc != SK_REDIRECT) - preempt_enable(); kfree_skb(skb); } } -- cgit v1.2.3 From fb50df8d32283cd95932a182a46a10070c4a8832 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Oct 2017 07:11:22 -0700 Subject: bpf: require CAP_NET_ADMIN when using sockmap maps Restrict sockmap to CAP_NET_ADMIN. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/sockmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index beaabb21c3a3..2b6eb35ae5d3 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -486,6 +486,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) int err = -EINVAL; u64 cost; + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) -- cgit v1.2.3 From 9ef2a8cd5c0dcb8e1f1534615c56eb13b630c363 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Oct 2017 07:11:44 -0700 Subject: bpf: require CAP_NET_ADMIN when using devmap Devmap is used with XDP which requires CAP_NET_ADMIN so lets also make CAP_NET_ADMIN required to use the map. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 920428d84da2..52e0548ba548 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -78,6 +78,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) int err = -EINVAL; u64 cost; + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) -- cgit v1.2.3 From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:22 -0700 Subject: bpf: Add file mode configuration into bpf maps Introduce the map read/write flags to the eBPF syscalls that returns the map fd. The flags is used to set up the file mode when construct a new file descriptor for bpf maps. To not break the backward capability, the f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise it should be O_RDONLY or O_WRONLY. When the userspace want to modify or read the map content, it will check the file mode to see if it is allowed to make the change. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 6 +++- kernel/bpf/devmap.c | 5 ++- kernel/bpf/hashtab.c | 5 +-- kernel/bpf/inode.c | 15 ++++++--- kernel/bpf/lpm_trie.c | 3 +- kernel/bpf/sockmap.c | 5 ++- kernel/bpf/stackmap.c | 5 ++- kernel/bpf/syscall.c | 88 +++++++++++++++++++++++++++++++++++++++++++++------ 8 files changed, 110 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 68d866628be0..988c04c91e10 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -19,6 +19,9 @@ #include "map_in_map.h" +#define ARRAY_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + attr->value_size == 0 || + attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e093d9a2c4dd..e5d3de7cff2e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -50,6 +50,9 @@ #include #include +#define DEV_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_dtab_netdev { struct net_device *dev; struct bpf_dtab *dtab; @@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..919955236e63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,8 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" -#define HTAB_CREATE_FLAG_MASK \ - (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) struct bucket { struct hlist_nulls_head head; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be1dde967208..01aaef1a77c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,7 +295,7 @@ out: } static void *bpf_obj_do_get(const struct filename *pathname, - enum bpf_type *type) + enum bpf_type *type, int flags) { struct inode *inode; struct path path; @@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, return ERR_PTR(ret); inode = d_backing_inode(path.dentry); - ret = inode_permission(inode, MAY_WRITE); + ret = inode_permission(inode, ACC_MODE(flags)); if (ret) goto out; @@ -326,18 +326,23 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname) +int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; struct filename *pname; int ret = -ENOENT; + int f_flags; void *raw; + f_flags = bpf_get_file_flag(flags); + if (f_flags < 0) + return f_flags; + pname = getname(pathname); if (IS_ERR(pname)) return PTR_ERR(pname); - raw = bpf_obj_do_get(pname, &type); + raw = bpf_obj_do_get(pname, &type, f_flags); if (IS_ERR(raw)) { ret = PTR_ERR(raw); goto out; @@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname) if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type 
== BPF_TYPE_MAP) - ret = bpf_map_new_fd(raw); + ret = bpf_map_new_fd(raw, f_flags); else goto out; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 34d8a690ea05..885e45479680 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -495,7 +495,8 @@ out: #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) -#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) static struct bpf_map *trie_alloc(union bpf_attr *attr) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a298d6666698..86ec846f2d5e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -40,6 +40,9 @@ #include #include +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_stab { struct bpf_map map; struct sock **sock_map; @@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 135be433e9a0..a15bc636cc98 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,9 @@ #include #include "percpu_freelist.h" +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; @@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags & ~BPF_F_NUMA_NODE) + if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0e893cac6795..676a06e6b322 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -34,6 +34,8 @@ #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) } #endif +static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, + loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_READ. + */ + return -EINVAL; +} + +static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_WRITE. 
+ */ + return -EINVAL; +} + static const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; -int bpf_map_new_fd(struct bpf_map *map) +int bpf_map_new_fd(struct bpf_map *map, int flags) { return anon_inode_getfd("bpf-map", &bpf_map_fops, map, - O_RDWR | O_CLOEXEC); + flags | O_CLOEXEC); +} + +int bpf_get_file_flag(int flags) +{ + if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) + return -EINVAL; + if (flags & BPF_F_RDONLY) + return O_RDONLY; + if (flags & BPF_F_WRONLY) + return O_WRONLY; + return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ @@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; + int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) + return f_flags; + if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) @@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - err = bpf_map_new_fd(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. * bpf_map_put() is needed because the above @@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + if (ukey) { key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { @@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = { .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) @@ -1117,11 +1177,11 @@ free_prog_nouncharge: return err; } -#define BPF_OBJ_LAST_FIELD bpf_fd +#define BPF_OBJ_LAST_FIELD file_flags static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ)) + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) return -EINVAL; return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); @@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr) static int bpf_obj_get(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || + attr->file_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; - return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + attr->file_flags); } #ifdef CONFIG_CGROUP_BPF @@ -1392,20 +1454,26 @@ static int 
bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } -#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; + int f_flags; int fd; - if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || + attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + f_flags = bpf_get_file_flag(attr->open_flags); + if (f_flags < 0) + return f_flags; + spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) @@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - fd = bpf_map_new_fd(map); + fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put(map); -- cgit v1.2.3 From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:24 -0700 Subject: security: bpf: Add LSM hooks for bpf object related syscall Introduce several LSM hooks for the syscalls that will allow the userspace to access to eBPF object such as eBPF programs and eBPF maps. The security check is aimed to enforce a per object security protection for eBPF object so only processes with the right priviliges can read/write to a specific map or use a specific eBPF program. Besides that, a general security hook is added before the multiplexer of bpf syscall to check the cmd and the attribute used for the command. The actual security module can decide which command need to be checked and how the cmd should be checked. Signed-off-by: Chenbo Feng Acked-by: James Morris Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 676a06e6b322..5cb56d06b48d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -212,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); bpf_map_uncharge_memlock(map); + security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -325,6 +326,12 @@ static const struct file_operations bpf_map_fops = { int bpf_map_new_fd(struct bpf_map *map, int flags) { + int ret; + + ret = security_bpf_map(map, OPEN_FMODE(flags)); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); } @@ -405,10 +412,14 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - err = bpf_map_charge_memlock(map); + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; + err = bpf_map_charge_memlock(map); + if (err) + goto free_map_sec; + err = bpf_map_alloc_id(map); if (err) goto free_map; @@ -430,6 +441,8 @@ static int map_create(union bpf_attr *attr) free_map: bpf_map_uncharge_memlock(map); +free_map_sec: + security_bpf_map_free(map); free_map_nouncharge: map->ops->map_free(map); return err; @@ -914,6 +927,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); + security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -972,6 +986,12 @@ static const struct file_operations bpf_prog_fops = { int bpf_prog_new_fd(struct bpf_prog *prog) { + int ret; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ret; + return 
anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } @@ -1111,10 +1131,14 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; - err = bpf_prog_charge_memlock(prog); + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_sec; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -1172,6 +1196,8 @@ free_used_maps: free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); +free_prog_sec: + security_bpf_prog_free(prog->aux); free_prog_nouncharge: bpf_prog_free(prog); return err; @@ -1640,6 +1666,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; + err = security_bpf(cmd, &attr, size); + if (err < 0) + return err; + switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); -- cgit v1.2.3 From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:26 -0700 Subject: selinux: bpf: Add addtional check for bpf object file receive Introduce a bpf object related check when sending and receiving files through unix domain socket as well as binder. It checks if the receiving process have privilege to read/write the bpf map or use the bpf program. This check is necessary because the bpf maps and programs are using a anonymous inode as their shared inode so the normal way of checking the files and sockets when passing between processes cannot work properly on eBPF object. This check only works when the BPF_SYSCALL is configured. Signed-off-by: Chenbo Feng Acked-by: Stephen Smalley Reviewed-by: James Morris Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5cb56d06b48d..323be2473c4b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -315,7 +315,7 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } -static const struct file_operations bpf_map_fops = { +const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif @@ -975,7 +975,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) } #endif -static const struct file_operations bpf_prog_fops = { +const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif -- cgit v1.2.3 From e4d0b679a8467b6d0c169642f0b5f57d8d6eacc2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Sep 2017 14:30:06 -0700 Subject: srcu: Add parameters to SRCU docbook comments Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 729a8706751d..6d5880089ff6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. + * @rhp: structure to be used for queueing the SRCU callback. 
* @func: function to be invoked after the SRCU grace period * * The callback function will be invoked some time after a full SRCU -- cgit v1.2.3 From 1c9fec470b81ca5e89391c20a11ead31a1e9314b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Oct 2017 07:36:05 -0700 Subject: waitid(): Avoid unbalanced user_access_end() on access_ok() error As pointed out by Linus and David, the earlier waitid() fix resulted in a (currently harmless) unbalanced user_access_end() call. This fixes it to just directly return EFAULT on access_ok() failure. Fixes: 96ca579a1ecc ("waitid(): Add missing access_ok() checks") Acked-by: David Daney Cc: Al Viro Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index cf28528842bc..f6cad39f35df 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1611,7 +1611,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return err; if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) - goto Efault; + return -EFAULT; user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1739,7 +1739,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, return err; if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) - goto Efault; + return -EFAULT; user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); -- cgit v1.2.3 From 1f7c70d6b2bc5de301f30456621e1161fddf4242 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Oct 2017 16:06:52 +0200 Subject: cpu/hotplug: Reset node state after operation The recent rework of the cpu hotplug internals changed the usage of the per cpu state->node field, but missed to clean it up after usage. So subsequent hotplug operations use the stale pointer from a previous operation and hand it into the callback functions. The callbacks then dereference a pointer which either belongs to a different facility or points to freed and potentially reused memory. In either case data corruption and crashes are the obvious consequence. Reset the node and the last pointers in the per cpu state to NULL after the operation which set them has completed. Fixes: 96abb968549c ("smp/hotplug: Allow external multi-instance rollback") Reported-by: Tvrtko Ursulin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Boris Ostrovsky Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710211606130.3213@nanos --- kernel/cpu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d851df22f5c5..04892a82f6ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -632,6 +632,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, __cpuhp_kick_ap(st); } + /* + * Clean up the leftovers so the next hotplug operation wont use stale + * data. + */ + st->node = st->last = NULL; return ret; } -- cgit v1.2.3 From b5149873a0c299195b5346fe4dc2c5b04ae2f995 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Sat, 21 Oct 2017 19:29:24 +0300 Subject: workqueue: respect isolated cpus when queueing an unbound work Initialize wq_unbound_cpumask to exclude cpus that were isolated by the cmdline's isolcpus parameter. 
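As a usage illustration (not part of the patch), the exclusion only changes the default: the patch deliberately keeps workqueue_set_unbound_cpumask() permissive, so an administrator who does want unbound work on isolated CPUs can opt back in through the existing sysfs attribute. A hedged user-space sketch, with an example mask for a 4-CPU system:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed path of the unbound-workqueue cpumask attribute. */
            FILE *f = fopen("/sys/devices/virtual/workqueue/cpumask", "w");

            if (!f) {
                    perror("workqueue cpumask");
                    return 1;
            }
            fputs("f\n", f);        /* re-include all CPUs, isolated or not */
            return fclose(f) ? 1 : 0;
    }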
Signed-off-by: Tal Shorer Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..bfa433b38a61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4980,6 +4980,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5579,7 +5583,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From 8695a5395661fbb4a4f26c97f801f3800ae4754e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 19 Oct 2017 09:03:52 -0700 Subject: bpf: devmap fix arithmetic overflow in bitmap_size calculation An integer overflow is possible in dev_map_bitmap_size() when calculating the BITS_TO_LONG logic which becomes, after macro replacement, (((n) + (d) - 1)/ (d)) where 'n' is a __u32 and 'd' is (8 * sizeof(long)). To avoid overflow cast to u64 before arithmetic. Reported-by: Richard Weinberger Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 52e0548ba548..e745d6a88224 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -69,7 +69,7 @@ static LIST_HEAD(dev_map_list); static u64 dev_map_bitmap_size(const union bpf_attr *attr) { - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); + return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) -- cgit v1.2.3 From fb2a311a31d3457fe8c3ee16f5609877e2ead9f7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Oct 2017 02:34:21 +0200 Subject: bpf: fix off by one for range markings with L{T, E} patterns During review I noticed that the current logic for direct packet access marking in check_cond_jmp_op() has an off by one for the upper right range border when marking in find_good_pkt_pointers() with BPF_JLT and BPF_JLE. It's not really harmful given access up to pkt_end is always safe, but we should nevertheless correct the range marking before it becomes ABI. If pkt_data' denotes a pkt_data derived pointer (pkt_data + X), then for pkt_data' < pkt_end in the true branch as well as for pkt_end <= pkt_data' in the false branch we mark the range with X although it should really be X - 1 in these cases. For example, X could be pkt_end - pkt_data, then when testing for pkt_data' < pkt_end the verifier simulation cannot deduce that a byte load of pkt_data' - 1 would succeed in this branch. Fixes: b4e432f1000a ("bpf: enable BPF_J{LT, LE, SLT, SLE} opcodes in verifier") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20f3889c006e..49cb5ad14746 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2430,12 +2430,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + bool range_right_open) { struct bpf_reg_state *regs = state->regs, *reg; + u16 new_range; int i; - if (dst_reg->off < 0) + if (dst_reg->off < 0 || + (dst_reg->off == 0 && range_right_open)) /* This doesn't give us any range */ return; @@ -2446,9 +2449,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, */ return; - /* LLVM can generate four kind of checks: + new_range = dst_reg->off; + if (range_right_open) + new_range--; + + /* Examples for register markings: * - * Type 1/2: + * pkt_data in dst register: * * r2 = r3; * r2 += 8; @@ -2465,7 +2472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * r2=pkt(id=n,off=8,r=0) * r3=pkt(id=n,off=0,r=0) * - * Type 3/4: + * pkt_data in src register: * * r2 = r3; * r2 += 8; @@ -2483,7 +2490,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * r3=pkt(id=n,off=0,r=0) * * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) - * so that range of bytes [r3, r3 + 8) is safe to access. + * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) + * and [r3, r3 + 8-1) respectively is safe to access depending on + * the check. */ /* If our ids match, then we must have the same max_value. And we @@ -2494,14 +2503,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ - regs[i].range = max_t(u16, regs[i].range, dst_reg->off); + regs[i].range = max(regs[i].range, new_range); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) - reg->range = max_t(u16, reg->range, dst_reg->off); + reg->range = max(reg->range, new_range); } } @@ -2865,19 +2874,19 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, &regs[insn->src_reg]); + find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, &regs[insn->src_reg]); + find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true); }
else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; -- cgit v1.2.3 From 0fd4759c5515b7f2297d7fee5c45e5d9dd733001 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Oct 2017 02:34:22 +0200 Subject: bpf: fix pattern matches for direct packet access Alexander had a test program with direct packet access, where the access test was in the form of data + X > data_end. In an unrelated change to the program LLVM decided to swap the branches and emitted code for the test in form of data + X <= data_end. We hadn't seen these being generated previously, thus verifier would reject the program. Therefore, fix up the verifier to detect all test cases, so we don't run into such issues in the future. Fixes: b4e432f1000a ("bpf: enable BPF_J{LT, LE, SLT, SLE} opcodes in verifier") Reported-by: Alexander Alemayhu Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 49cb5ad14746..c48ca2a34b5e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2874,18 +2874,42 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' > pkt_end */ find_good_pkt_pointers(this_branch, dst_reg, false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end > pkt_data' */ + find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' < pkt_end */ find_good_pkt_pointers(other_branch, dst_reg, true); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end < pkt_data' */ + find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' >= pkt_end */ + find_good_pkt_pointers(this_branch, dst_reg, true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end >= pkt_data' */ find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' <= pkt_end */ + find_good_pkt_pointers(other_branch, dst_reg, false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end <= pkt_data' */ find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; -- cgit v1.2.3 From 31749468c3f9d77927ed3144259dc208e6625ede Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 23 Oct 2017 19:39:28 +0200 Subject: bpf: cpumap fix
potential lost wake-up problem As pointed out by Michael, commit 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") contains a classical example of the potential lost wake-up problem. We need to recheck the condition __ptr_ring_empty() after changing current->state to TASK_INTERRUPTIBLE, this avoids a race between wake_up_process() and schedule(). After this, a race with wake_up_process() will simply change the state to TASK_RUNNING, and the schedule() call not really put us to sleep. Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") Reported-by: "Michael S. Tsirkin" Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b4358d84ddf1..86e29cbf7827 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -288,13 +288,17 @@ static int cpu_map_kthread_run(void *data) /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); - sched = 1; + set_current_state(TASK_INTERRUPTIBLE); + /* Recheck to avoid lost wake-up */ + if (__ptr_ring_empty(rcpu->queue)) { + schedule(); + sched = 1; + } else { + __set_current_state(TASK_RUNNING); + } } else { sched = cond_resched(); } - __set_current_state(TASK_RUNNING); /* Process packets in rcpu->queue */ local_bh_disable(); -- cgit v1.2.3 From e22cdc3fc5991956146b9856d36b4971fe54dcd6 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 23 Oct 2017 19:01:54 +0600 Subject: sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK cpulist_parse() uses nr_cpumask_bits as a limit to parse the passed buffer from kernel commandline. What nr_cpumask_bits represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option: - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as NR_CPUS, which might not represent the # of CPUs that really exist (default 64). So, there's a chance of a gap between nr_cpu_ids and NR_CPUS, which ultimately lead towards invalid cpulist_parse() operation. For example, if isolcpus=9 is passed on an 8 cpu system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error that it's supposed to. This patch fixes this bug by finding the last CPU of the passed isolcpus= list and checking it against nr_cpu_ids. It also fixes the error message where the nr_cpu_ids should be nr_cpu_ids-1, since CPU numbering starts from 0. Signed-off-by: Rakib Mullick Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adobriyan@gmail.com Cc: akpm@linux-foundation.org Cc: longman@redhat.com Cc: mka@chromium.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mullick@gmail.com [ Enhanced the changelog and the kernel message. 
] Signed-off-by: Ingo Molnar include/linux/cpumask.h | 16 ++++++++++++++++ kernel/sched/topology.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e50450c2fed8..e3d31b0880dc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -476,8 +476,8 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); + if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1); return 0; } return 1; -- cgit v1.2.3 From 506458efaf153c1ea480591c5602a5a3ba5a3b76 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE() READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/task_work.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d93db81fa36..824a583079a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4231,7 +4231,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4328,7 +4328,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ae832e13b97..8ac79355915b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -189,7 +189,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/task_work.c b/kernel/task_work.c index 836a72a66fba..9a9f262fc53d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -67,7 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. 
*/ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) -- cgit v1.2.3 From 0b4c6841fee03e096b735074a0c4aab3a8e92986 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:07 -0700 Subject: bpf: use the same condition in perf event set/free bpf handler This is a cleanup so that perf_event_free_bpf_prog() does the same check that we already do in perf_event_set_bpf_prog(). Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 31ee304a5844..9f78a6825bbe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8191,10 +8191,10 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; - perf_event_free_bpf_handler(event); - - if (!event->tp_event) + if (event->attr.type != PERF_TYPE_TRACEPOINT) { + perf_event_free_bpf_handler(event); return; + } prog = event->tp_event->prog; if (prog && event->tp_event->bpf_prog_owner == event) { -- cgit v1.2.3 From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:08 -0700 Subject: bpf: permit multiple bpf attachments for a single perf event This patch enables multiple bpf attachments for a single kprobe/uprobe/tracepoint trace event. Each trace_event keeps a list of attached perf events. When an event happens, all attached bpf programs will be executed based on the order of attachment. A global bpf_event_mutex lock is introduced to protect prog_array attaching and detaching. An alternative would be to introduce a mutex lock in every trace_event_call structure, but that would take a lot of extra memory. So a global bpf_event_mutex lock is a good compromise. The bpf prog detachment involves allocation of memory. If the allocation fails, a dummy do-nothing program will replace the to-be-detached program in place. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S.
Miller --- kernel/bpf/core.c | 81 ++++++++++++++++++++++++++++++++++++++++++ kernel/events/core.c | 26 +++++--------- kernel/trace/bpf_trace.c | 82 ++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_kprobe.c | 6 ++-- kernel/trace/trace_syscalls.c | 34 ++++++++++-------- kernel/trace/trace_uprobe.c | 3 +- 6 files changed, 188 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8e7c8bf2b687..7fe448799d76 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1394,6 +1394,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +static unsigned int __bpf_prog_ret1(const void *ctx, + const struct bpf_insn *insn) +{ + return 1; +} + +static struct bpf_prog_dummy { + struct bpf_prog prog; +} dummy_bpf_prog = { + .prog = { + .bpf_func = __bpf_prog_ret1, + }, +}; + /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() @@ -1463,6 +1477,73 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog) +{ + struct bpf_prog **prog = progs->progs; + + for (; *prog; prog++) + if (*prog == old_prog) { + WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + break; + } +} + +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array) +{ + int new_prog_cnt, carry_prog_cnt = 0; + struct bpf_prog **existing_prog; + struct bpf_prog_array *array; + int new_prog_idx = 0; + + /* Figure out how many existing progs we need to carry over to + * the new array. + */ + if (old_array) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) { + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + carry_prog_cnt++; + if (*existing_prog == include_prog) + return -EEXIST; + } + } + + /* How many progs (not NULL) will be in the new array? */ + new_prog_cnt = carry_prog_cnt; + if (include_prog) + new_prog_cnt += 1; + + /* Do we have any prog (not NULL) in the new array? 
*/ + if (!new_prog_cnt) { + *new_array = NULL; + return 0; + } + + /* +1 as the end of prog_array is marked with NULL */ + array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); + if (!array) + return -ENOMEM; + + /* Fill in the new prog array */ + if (carry_prog_cnt) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + array->progs[new_prog_idx++] = *existing_prog; + } + if (include_prog) + array->progs[new_prog_idx++] = include_prog; + array->progs[new_prog_idx] = NULL; + *new_array = array; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f78a6825bbe..9660ee65fbef 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,11 +7954,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { - struct bpf_prog *prog = call->prog; - - if (prog) { + if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; - if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } @@ -8147,13 +8145,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; + int ret; if (event->attr.type != PERF_TYPE_TRACEPOINT) return perf_event_set_bpf_handler(event, prog_fd); - if (event->tp_event->prog) - return -EEXIST; - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); @@ -8181,26 +8177,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EACCES; } } - event->tp_event->prog = prog; - event->tp_event->bpf_prog_owner = event; - return 0; + ret = perf_event_attach_bpf_prog(event, prog); + if (ret) + bpf_prog_put(prog); + return ret; } static void perf_event_free_bpf_prog(struct perf_event *event) { - struct bpf_prog *prog; - if (event->attr.type != PERF_TYPE_TRACEPOINT) { perf_event_free_bpf_handler(event); return; } - - prog = event->tp_event->prog; - if (prog && event->tp_event->bpf_prog_owner == event) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); - } + perf_event_detach_bpf_prog(event); } #else diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3126da2f468a..b65011d320e3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,7 +17,7 @@ /** * trace_call_bpf - invoke BPF program - * @prog: BPF program + * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. 
@@ -29,7 +29,7 @@ * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; @@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) goto out; } - rcu_read_lock(); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heurisitc to speed up execution. + * + * If bpf_prog_array_valid() fetched prog_array was + * non-NULL, we go into trace_call_bpf() and do the actual + * proper rcu_dereference() under RCU lock. + * If it turns out that prog_array is NULL then, we bail out. + * For the opposite, if the bpf_prog_array_valid() fetched pointer + * was NULL, you'll skip the prog_array with the risk of missing + * out of events when it was updated in between this and the + * rcu_dereference() which is accepted risk. + */ + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); out: __this_cpu_dec(bpf_prog_active); @@ -741,3 +754,62 @@ const struct bpf_verifier_ops perf_event_verifier_ops = { const struct bpf_prog_ops perf_event_prog_ops = { }; + +static DEFINE_MUTEX(bpf_event_mutex); + +int perf_event_attach_bpf_prog(struct perf_event *event, + struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret = -EEXIST; + + mutex_lock(&bpf_event_mutex); + + if (event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto out; + + /* set the new array to event->tp_event and set event->prog */ + event->prog = prog; + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + +out: + mutex_unlock(&bpf_event_mutex); + return ret; +} + +void perf_event_detach_bpf_prog(struct perf_event *event) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret; + + mutex_lock(&bpf_event_mutex); + + if (!event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + + ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret < 0) { + bpf_prog_array_delete_safe(old_array, event->prog); + } else { + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + } + + bpf_prog_put(event->prog); + event->prog = NULL; + +out: + mutex_unlock(&bpf_event_mutex); +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8a907e12b6b9..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,13 +1174,12 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); @@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { 
struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..71a6af34d7a9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,9 +559,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) { +static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -573,7 +574,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) param.args[i] = rec->args[i]; - return trace_call_bpf(prog, ¶m); + return trace_call_bpf(call, ¶m); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) @@ -581,7 +582,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -596,9 +597,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; - prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -614,7 +615,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + if ((valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -659,8 +661,9 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } -static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_trace_exit *rec) { +static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_trace_exit *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -670,7 +673,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; - return trace_call_bpf(prog, ¶m); + return trace_call_bpf(call, ¶m); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) @@ -678,7 +681,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int 
syscall_nr; int rctx; int size; @@ -693,9 +696,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); + if (!valid_prog_array && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -709,7 +712,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + if ((valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..153c0e411461 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; - struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); -- cgit v1.2.3 From e0d02285f16e8d5810f3d5d5e8a5886ca0015d3b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:47 +0100 Subject: locking/qrwlock: Use 'struct qrwlock' instead of 'struct __qrwlock' There's no good reason to keep the internal structure of struct qrwlock hidden from qrwlock.h, particularly as it's actually needed for unlock and ends up being abstracted independently behind the __qrwlock_write_byte() function. Stop pretending we can hide this stuff, and move the __qrwlock definition into qrwlock, removing the __qrwlock_write_byte() nastiness and using the same struct definition everywhere instead. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2655f26ec882..1af791e37348 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -23,26 +23,6 @@ #include #include -/* - * This internal data structure is used for optimizing access to some of - * the subfields within the atomic_t cnts. - */ -struct __qrwlock { - union { - atomic_t cnts; - struct { -#ifdef __LITTLE_ENDIAN - u8 wmode; /* Writer mode */ - u8 rcnts[3]; /* Reader counts */ -#else - u8 rcnts[3]; /* Reader counts */ - u8 wmode; /* Writer mode */ -#endif - }; - }; - arch_spinlock_t lock; -}; - /** * rspin_until_writer_unlock - inc reader count & spin until writer is gone * @lock : Pointer to queue rwlock structure @@ -125,10 +105,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) * or wait for a previous writer to go away. 
*/ for (;;) { - struct __qrwlock *l = (struct __qrwlock *)lock; - - if (!READ_ONCE(l->wmode) && - (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) + if (!READ_ONCE(lock->wmode) && + (cmpxchg_relaxed(&lock->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax(); -- cgit v1.2.3 From b519b56e378ee82caf9b079b04f5db87dedc3251 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:49 +0100 Subject: locking/qrwlock: Use atomic_cond_read_acquire() when spinning in qrwlock The qrwlock slowpaths involve spinning when either a prospective reader is waiting for a concurrent writer to drain, or a prospective writer is waiting for concurrent readers to drain. In both of these situations, atomic_cond_read_acquire() can be used to avoid busy-waiting and make use of any backoff functionality provided by the architecture. This patch replaces the open-code loops and rspin_until_writer_unlock() implementation with atomic_cond_read_acquire(). The write mode transition zero to _QW_WAITING is left alone, since (a) this doesn't need acquire semantics and (b) should be fast. Tested-by: Waiman Long Tested-by: Jeremy Linton Tested-by: Adam Wallis Tested-by: Jan Glauber Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 50 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 1af791e37348..5825e0fc1a8e 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -23,29 +23,11 @@ #include #include -/** - * rspin_until_writer_unlock - inc reader count & spin until writer is gone - * @lock : Pointer to queue rwlock structure - * @writer: Current queue rwlock writer status byte - * - * In interrupt context or at the head of the queue, the reader will just - * increment the reader count & wait until the writer releases the lock. - */ -static __always_inline void -rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) -{ - while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax(); - cnts = atomic_read_acquire(&lock->cnts); - } -} - /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure - * @cnts: Current qrwlock lock value */ -void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) +void queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -53,13 +35,12 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) if (unlikely(in_interrupt())) { /* * Readers in interrupt context will get the lock immediately - * if the writer is just waiting (not holding the lock yet). - * The rspin_until_writer_unlock() function returns immediately - * in this case. Otherwise, they will spin (with ACQUIRE - * semantics) until the lock is available without waiting in - * the queue. + * if the writer is just waiting (not holding the lock yet), + * so spin with ACQUIRE semantics until the lock is available + * without waiting in the queue. 
*/ - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) + != _QW_LOCKED); return; } atomic_sub(_QR_BIAS, &lock->cnts); @@ -68,14 +49,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * Put the reader into the wait queue */ arch_spin_lock(&lock->wait_lock); + atomic_add(_QR_BIAS, &lock->cnts); /* * The ACQUIRE semantics of the following spinning code ensure * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) != _QW_LOCKED); /* * Signal the next one in queue to become queue head @@ -90,8 +71,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); */ void queued_write_lock_slowpath(struct qrwlock *lock) { - u32 cnts; - /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -113,15 +92,10 @@ void queued_write_lock_slowpath(struct qrwlock *lock) } /* When no more readers, set the locked flag */ - for (;;) { - cnts = atomic_read(&lock->cnts); - if ((cnts == _QW_WAITING) && - (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) - break; - - cpu_relax(); - } + do { + atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); + } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, + _QW_LOCKED) != _QW_WAITING); unlock: arch_spin_unlock(&lock->wait_lock); } -- cgit v1.2.3 From d133166146333e1f13fc81c0e6c43c8d99290a8a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Oct 2017 13:20:51 +0100 Subject: locking/qrwlock: Prevent slowpath writers getting held up by fastpath When a prospective writer takes the qrwlock locking slowpath due to the lock being held, it attempts to cmpxchg the wmode field from 0 to _QW_WAITING so that concurrent lockers also take the slowpath and queue on the spinlock accordingly, allowing the lockers to drain. Unfortunately, this isn't fair, because a fastpath writer that comes in after the lock is made available but before the _QW_WAITING flag is set can effectively jump the queue. If there is a steady stream of prospective writers, then the waiter will be held off indefinitely. This patch restores fairness by separating _QW_WAITING and _QW_LOCKED into two distinct fields: _QW_LOCKED continues to occupy the bottom byte of the lockword so that it can be cleared unconditionally when unlocking, but _QW_WAITING now occupies what used to be the bottom bit of the reader count. This then forces the slow-path for concurrent lockers. Tested-by: Waiman Long Tested-by: Jeremy Linton Tested-by: Adam Wallis Tested-by: Jan Glauber Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Boqun Feng Cc: Jeremy.Linton@arm.com Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1507810851-306-6-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 5825e0fc1a8e..c7471c3fb798 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -39,8 +39,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * so spin with ACQUIRE semantics until the lock is available * without waiting in the queue. 
*/ - atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) - != _QW_LOCKED); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); return; } atomic_sub(_QR_BIAS, &lock->cnts); @@ -56,7 +55,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) != _QW_LOCKED); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); /* * Signal the next one in queue to become queue head @@ -79,19 +78,10 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; - /* - * Set the waiting flag to notify readers that a writer is pending, - * or wait for a previous writer to go away. - */ - for (;;) { - if (!READ_ONCE(lock->wmode) && - (cmpxchg_relaxed(&lock->wmode, 0, _QW_WAITING) == 0)) - break; - - cpu_relax(); - } + /* Set the waiting flag to notify readers that a writer is pending */ + atomic_add(_QW_WAITING, &lock->cnts); - /* When no more readers, set the locked flag */ + /* When no more readers or writers, set the locked flag */ do { atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, -- cgit v1.2.3 From c95491ed6d6a958743d82bea1d053819988da418 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:22 -0700 Subject: locking/atomics, workqueue: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts the workqueue code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Acked-by: Tejun Heo Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-12-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..39831b2f3c5f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4647,7 +4647,7 @@ static void rebind_workers(struct worker_pool *pool) * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. 
* - * ACCESS_ONCE() is necessary because @worker->flags may be + * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_waking_up(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency @@ -4656,7 +4656,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; - ACCESS_ONCE(worker->flags) = worker_flags; + WRITE_ONCE(worker->flags, worker_flags); } spin_unlock_irq(&pool->lock); -- cgit v1.2.3 From 6aa7de059173a986114ac43b8f50b297a86f09a8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:29 -0700 Subject: locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. 
McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/acct.c | 4 ++-- kernel/events/core.c | 6 +++--- kernel/events/ring_buffer.c | 2 +- kernel/exit.c | 2 +- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_stack.c | 2 +- kernel/user_namespace.c | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 5e72af29ab73..21eedd0dd81a 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -146,7 +146,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) again: smp_rmb(); rcu_read_lock(); - res = to_acct(ACCESS_ONCE(ns->bacct)); + res = to_acct(READ_ONCE(ns->bacct)); if (!res) { rcu_read_unlock(); return NULL; @@ -158,7 +158,7 @@ again: } rcu_read_unlock(); mutex_lock(&res->lock); - if (res != to_acct(ACCESS_ONCE(ns->bacct))) { + if (res != to_acct(READ_ONCE(ns->bacct))) { mutex_unlock(&res->lock); acct_put(res); goto again; diff --git a/kernel/events/core.c b/kernel/events/core.c index 824a583079a1..8fd2f2d1358a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1200,7 +1200,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); - ctx = ACCESS_ONCE(event->ctx); + ctx = READ_ONCE(event->ctx); if (!atomic_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; @@ -5302,8 +5302,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; - aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); - aux_size = ACCESS_ONCE(rb->user_page->aux_size); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..f3e37971c842 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + aux_tail = READ_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); diff --git a/kernel/exit.c b/kernel/exit.c index f6cad39f35df..6b4298a41167 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ - int exit_state = ACCESS_ONCE(p->exit_state); + int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 81279c6602ff..845f3805c73d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, * if it happened, we have to fail the write. 
*/ barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 652c682707cd..9050c8b3ccde 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1459,7 +1459,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr, static inline void *event_file_data(struct file *filp) { - return ACCESS_ONCE(file_inode(filp)->i_private); + return READ_ONCE(file_inode(filp)->i_private); } extern struct mutex event_mutex; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 49cb41412eec..780262210c9a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -77,7 +77,7 @@ check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; - int frame_size = ACCESS_ONCE(tracer_frame); + int frame_size = READ_ONCE(tracer_frame); int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..d32b45662fb6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file, int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; - unsigned long userns_flags = ACCESS_ONCE(ns->flags); + unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? -- cgit v1.2.3 From d141babe4244945f1d001118578e0eb3ce12729d Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:00 +0900 Subject: locking/lockdep: Add a boot parameter allowing unwind in cross-release and disable it by default Johan Hovold reported a heavy performance regression caused by lockdep cross-release: > Boot time (from "Linux version" to login prompt) had in fact doubled > since 4.13 where it took 17 seconds (with my current config) compared to > the 35 seconds I now see with 4.14-rc4. > > A quick bisect pointed to lockdep and specifically the following commit: > > 28a903f63ec0 ("locking/lockdep: Handle non(or multi)-acquisition > of a crosslock") > > which I've verified is the commit which doubled the boot time (compared > to 28a903f63ec0^) (added by lockdep crossrelease series [1]). Currently cross-release performs unwind on every acquisition, but that is very expensive. This patch makes unwind optional, disables it by default and only records acquire_ip. Full stack traces are sometimes required for full analysis, in which case a boot parameter, crossrelease_fullstack, can be specified. On my qemu Ubuntu machine (x86_64, 4 cores, 512M), the regression was fixed. We measure boot times with 'perf stat --null --repeat 10 $QEMU', where $QEMU launches a kernel with init=/bin/true: 1. No lockdep enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.756558155 seconds time elapsed ( +- 0.09% ) 2. Lockdep enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.968710420 seconds time elapsed ( +- 0.12% ) 3. Lockdep enabled + cross-release enabled: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 3.153839636 seconds time elapsed ( +- 0.31% ) 4.
Lockdep enabled + cross-release enabled + this patch applied: Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs): 2.963669551 seconds time elapsed ( +- 0.11% ) I.e. lockdep cross-release performance is now indistinguishable from vanilla lockdep. Bisected-by: Johan Hovold Analyzed-by: Thomas Gleixner Suggested-by: Thomas Gleixner Reported-by: Johan Hovold Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-5-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e36e652d996f..160b5d6df7cb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,6 +76,15 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +static int crossrelease_fullstack; +static int __init allow_crossrelease_fullstack(char *str) +{ + crossrelease_fullstack = 1; + return 0; +} + +early_param("crossrelease_fullstack", allow_crossrelease_fullstack); + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -4863,8 +4872,14 @@ static void add_xhlock(struct held_lock *hlock) xhlock->trace.nr_entries = 0; xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; xhlock->trace.entries = xhlock->trace_entries; - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); + + if (crossrelease_fullstack) { + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); + } else { + xhlock->trace.nr_entries = 1; + xhlock->trace.entries[0] = hlock->acquire_ip; + } } static inline int same_context_xhlock(struct hist_lock *xhlock) -- cgit v1.2.3 From e121d64e16484d4a5eba94cd2fa9eb3848b7c9c2 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:02 +0900 Subject: locking/lockdep: Introduce CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK=y Add a Kconfig knob that enables the lockdep "crossrelease_fullstack" boot parameter. 
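The Kconfig entry itself lives in lib/Kconfig.debug and is therefore outside the kernel/-limited diff below; roughly, it looks like this:

config BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
	bool "Enable the boot parameter, crossrelease_fullstack"
	depends on LOCKDEP_CROSSRELEASE
	default n

With this option selected, the lockdep code below defaults crossrelease_fullstack to 1 instead of relying on the boot command line.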
Suggested-by: Ingo Molnar Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-7-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 160b5d6df7cb..db933d063bfc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,7 +76,11 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK +static int crossrelease_fullstack = 1; +#else static int crossrelease_fullstack; +#endif static int __init allow_crossrelease_fullstack(char *str) { crossrelease_fullstack = 1; -- cgit v1.2.3 From fd1a5b04dfb899f84ddeb8acdaea6b98283df1e5 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 25 Oct 2017 17:56:04 +0900 Subject: workqueue: Remove now redundant lock acquisitions wrt. workqueue flushes The workqueue code added manual lock acquisition annotations to catch deadlocks. After lockdep cross-release was introduced, some of those became redundant, since wait_for_completion() already does the acquisition and tracking. Remove the duplicate annotations. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: amir73il@gmail.com Cc: axboe@kernel.dk Cc: darrick.wong@oracle.com Cc: david@fromorbit.com Cc: hch@infradead.org Cc: idryomov@gmail.com Cc: johan@kernel.org Cc: johannes.berg@intel.com Cc: kernel-team@lge.com Cc: linux-block@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1508921765-15396-9-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 39831b2f3c5f..160fdc6e839a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2497,15 +2497,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - /* - * Explicitly init the crosslock for wq_barrier::done, make its lock - * key a subkey of the corresponding work. As a result we won't - * build a dependency between wq_barrier::done and unrelated work.
- */ - lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, - "(complete)wq_barr::done", - target->lockdep_map.key, 1); - __init_completion(&barr->done); + init_completion_map(&barr->done, &target->lockdep_map); + barr->task = current; /* @@ -2611,16 +2604,13 @@ void flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), }; int next_color; if (WARN_ON(!wq_online)) return; - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->mutex); /* @@ -2883,9 +2873,6 @@ bool flush_work(struct work_struct *work) if (WARN_ON(!wq_online)) return false; - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); -- cgit v1.2.3 From 54b933c6c954a8b7b0c2b40a1c4d3f7279d11e22 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Wed, 25 Oct 2017 19:28:27 +0800 Subject: sched/idle: Micro-optimize the idle loop Move the loop-invariant calculation of 'cpu' in do_idle() out of the loop body, because the current CPU is always constant. This improves the generated code both on x86-64 and ARM64: x86-64: Before patch (execution in loop): 864: 0f ae e8 lfence 867: 65 8b 05 c2 38 f1 7e mov %gs:0x7ef138c2(%rip),%eax 86e: 89 c0 mov %eax,%eax 870: 48 0f a3 05 68 19 08 bt %rax,0x1081968(%rip) 877: 01 After patch (execution in loop): 872: 0f ae e8 lfence 875: 4c 0f a3 25 63 19 08 bt %r12,0x1081963(%rip) 87c: 01 ARM64: Before patch (execution in loop): c58: d5033d9f dsb ld c5c: d538d080 mrs x0, tpidr_el1 c60: b8606a61 ldr w1, [x19,x0] c64: 1100fc20 add w0, w1, #0x3f c68: 7100003f cmp w1, #0x0 c6c: 1a81b000 csel w0, w0, w1, lt c70: 13067c00 asr w0, w0, #6 c74: 93407c00 sxtw x0, w0 c78: f8607a80 ldr x0, [x20,x0,lsl #3] c7c: 9ac12401 lsr x1, x0, x1 c80: 36000581 tbz w1, #0, d30 After patch (execution in loop): c84: d5033d9f dsb ld c88: f9400260 ldr x0, [x19] c8c: ea14001f tst x0, x20 c90: 54000580 b.eq d40 Signed-off-by: Cheng Jian [ Rewrote the title and the changelog. ] Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: huawei.libin@huawei.com Cc: xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1508930907-107755-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b2e8f0afbb42..7dae9eb8c042 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -209,6 +209,7 @@ exit_idle: */ static void do_idle(void) { + int cpu = smp_processor_id(); /* * If the arch has a polling bit, we maintain an invariant: * @@ -225,7 +226,7 @@ static void do_idle(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit v1.2.3 From 5aaf1ab55389aeb6ce5527580a1a4d4dbc0f41ff Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Oct 2017 16:56:50 +0200 Subject: livepatch: Correctly call klp_post_unpatch_callback() in error paths The post_unpatch_enabled flag in struct klp_callbacks is set when a pre-patch callback successfully executes, indicating that we need to call a corresponding post-unpatch callback when the patch is reverted. 
This is true for ordinary patch disable as well as the error paths of klp_patch_object() callers. As currently coded, we inadvertently execute the post-unpatch callback twice in klp_module_coming() when klp_patch_object() fails: - We explicitly call klp_post_unpatch_callback() for the failed object - We call it again for the same object (and all the others) via klp_cleanup_module_patches_limited() We should clear the flag in klp_post_unpatch_callback() to make sure that the callback is not called twice. It makes the API safer. (We could have removed the callback from the former error path as it would be covered by the latter call, but I think that it is cleaner to clear the post_unpatch_enabled flag after the callback is invoked. For example, someone might later decide to call the callback only when the obj->patched flag is set.) There is another mistake in the error path of klp_module_coming() in which it skips the post-unpatch callback for the klp_transition_patch. However, the pre-patch callback was called even for this patch, so be sure to make the corresponding callbacks for all patches. Finally, I used this opportunity to make klp_pre_patch_callback() more readable. [jkosina@suse.cz: incorporate changelog wording changes proposed by Joe Lawrence] Signed-off-by: Petr Mladek Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 4 +--- kernel/livepatch/core.h | 8 +++++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index cafb5a84417d..eb134479c394 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -894,9 +894,7 @@ int klp_module_coming(struct module *mod) pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); - if (patch != klp_transition_patch) - klp_post_unpatch_callback(obj); - + klp_post_unpatch_callback(obj); goto err; } diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index 6fc907b54e71..cc3aa708e0b4 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -12,10 +12,10 @@ static inline bool klp_is_object_loaded(struct klp_object *obj) static inline int klp_pre_patch_callback(struct klp_object *obj) { - int ret; + int ret = 0; - ret = (obj->callbacks.pre_patch) ? - (*obj->callbacks.pre_patch)(obj) : 0; + if (obj->callbacks.pre_patch) + ret = (*obj->callbacks.pre_patch)(obj); obj->callbacks.post_unpatch_enabled = !ret; @@ -39,6 +39,8 @@ static inline void klp_post_unpatch_callback(struct klp_object *obj) if (obj->callbacks.post_unpatch_enabled && obj->callbacks.post_unpatch) (*obj->callbacks.post_unpatch)(obj); + + obj->callbacks.post_unpatch_enabled = false; } #endif /* _LIVEPATCH_CORE_H */ -- cgit v1.2.3 From 89a9a1c1c89cea5f70975c338c011b9f7024dee5 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Oct 2017 16:56:51 +0200 Subject: livepatch: __klp_disable_patch() should never be called for disabled patches __klp_disable_patch() should never be called when the patch is not enabled. Let's add the same warning that we have in __klp_enable_patch(). This allows us to remove the check when calling klp_pre_unpatch_callback(). It was strange anyway because it repeatedly checked the per-patch flag for each patched object.
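For reference, the existing check in __klp_enable_patch() that this mirrors is, roughly, an early WARN_ON(patch->enabled) that returns -EINVAL, so after this change both the enable and the disable paths reject an obviously inconsistent patch state.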
Signed-off-by: Petr Mladek Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb134479c394..287f71e9dbfe 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -282,6 +282,9 @@ static int __klp_disable_patch(struct klp_patch *patch) { struct klp_object *obj; + if (WARN_ON(!patch->enabled)) + return -EINVAL; + if (klp_transition_patch) return -EBUSY; @@ -293,7 +296,7 @@ static int __klp_disable_patch(struct klp_patch *patch) klp_init_transition(patch, KLP_UNPATCHED); klp_for_each_object(patch, obj) - if (patch->enabled && obj->patched) + if (obj->patched) klp_pre_unpatch_callback(obj); /* -- cgit v1.2.3 From d41bf8c9deaed1a90b18d3ffc5639d4c19f0259a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Oct 2017 16:18:27 -0700 Subject: cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat The basic cpu stat is currently shown with "cpu." prefix in cgroup.stat, and the same information is duplicated in cpu.stat when cpu controller is enabled. This is ugly and not very scalable as we want to expand the coverage of stat information which is always available. This patch makes cgroup core always create "cpu.stat" file and show the basic cpu stat there and calls the cpu controller to show the extra stats when enabled. This ensures that the same information isn't presented in multiple places and makes future expansion of basic stats easier. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup.c | 60 +++++++++++++++++++++++++++++++++++++++-- kernel/cgroup/stat.c | 10 +++---- kernel/sched/core.c | 13 +++------ 4 files changed, 68 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index fa642c99586a..4dc317090920 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -205,6 +205,7 @@ int cgroup_task_count(const struct cgroup *cgrp); void cgroup_stat_flush(struct cgroup *cgrp); int cgroup_stat_init(struct cgroup *cgrp); void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_show_cputime(struct seq_file *seq); void cgroup_stat_boot(void); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7975b20f1fd1..d9773e49a1b4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -463,6 +463,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->self; } +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist + * or is offline, %NULL is returned. 
+ */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (!css || !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest @@ -3311,11 +3333,40 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - cgroup_stat_show_cputime(seq, "cpu."); - return 0; } +static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_extra_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_extra_stat_show(seq, css); + css_put(css); + return ret; +} + +static int cpu_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int ret = 0; + + cgroup_stat_show_cputime(seq); +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); +#endif + return ret; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4423,6 +4474,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cpu.stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_stat_show, + }, { } /* terminate */ }; diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 9cce79e89320..133b465691d6 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -256,7 +256,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_cpu_stat_account_end(cgrp, cstat); } -void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +void cgroup_stat_show_cputime(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; @@ -278,10 +278,10 @@ void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); - seq_printf(seq, "%susage_usec %llu\n" - "%suser_usec %llu\n" - "%ssystem_usec %llu\n", - prefix, usage, prefix, utime, prefix, stime); + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); } int cgroup_stat_init(struct cgroup *cgrp) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad255162a830..0b3eec389552 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6678,13 +6678,12 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; -static int cpu_stat_show(struct seq_file *sf, void *v) +static int cpu_extra_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) { - cgroup_stat_show_cputime(sf, ""); - #ifdef CONFIG_CFS_BANDWIDTH { - struct task_group *tg = css_tg(seq_css(sf)); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; u64 throttled_usec; @@ -6817,11 +6816,6 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, #endif static struct cftype cpu_files[] = { - { - .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_stat_show, - }, #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "weight", @@ -6852,6 +6846,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = 
cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, -- cgit v1.2.3 From 7863406143d8bbbbda07a61285c5f4c217908dfd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:28 +0200 Subject: sched/isolation: Move housekeeping related code to its own file The housekeeping code is currently tied to the NOHZ code. As we are planning to make housekeeping independent from it, start with moving the relevant code to its own file. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/rcu/tree_plugin.h | 1 + kernel/rcu/update.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/isolation.c | 33 +++++++++++++++++++++++++++++++++ kernel/time/tick-sched.c | 18 ------------------ kernel/watchdog.c | 1 + 8 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 kernel/sched/isolation.c (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..c7632f51c5a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "../time/tick-internal.h" diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..79abeb0ca329 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,6 +51,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 78f54932ea1d..871d43d73375 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o +obj-$(CONFIG_NO_HZ_FULL) += isolation.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2288a145bf01..ad188acb7636 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56f343b8e749..591481db8c6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,6 +32,7 @@ #include #include #include +#include #include diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c new file mode 100644 index 000000000000..3589252ed476 --- /dev/null +++ b/kernel/sched/isolation.c @@ -0,0 +1,33 @@ +/* + * Housekeeping management. Manage the targets for routine code that can run on + * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 
+ * + * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * + */ + +#include +#include +#include +#include + +cpumask_var_t housekeeping_mask; + +void __init housekeeping_init(void) +{ + if (!tick_nohz_full_enabled()) + return; + + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); +} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7b258c59d78a..27d7d522ac4e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -166,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static atomic_t tick_dep_mask; @@ -438,13 +437,6 @@ void __init tick_nohz_init(void) return; } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; - } - /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own @@ -453,7 +445,6 @@ void __init tick_nohz_init(void) if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); - cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; return; } @@ -466,9 +457,6 @@ void __init tick_nohz_init(void) cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); - for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); @@ -478,12 +466,6 @@ void __init tick_nohz_init(void) WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); - - /* - * We need at least one CPU to handle housekeeping work such - * as timekeeping, unbound timers, workqueues, ... - */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6bcb854909c0..3c44dbaa4b9f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 13316b31fdaaa45f06793eb7992588359ba6ab9f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:29 +0200 Subject: sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version While trying to disable the watchog on nohz_full CPUs, the watchdog implements an ad-hoc version of housekeeping_cpumask(). Lets replace those re-invented lines. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3c44dbaa4b9f..562652c9c815 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -774,15 +774,10 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, void __init lockup_detector_init(void) { -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) { + if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_copy(&watchdog_cpumask, housekeeping_mask); - } else - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); -#else - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); -#endif + + cpumask_copy(&watchdog_cpumask, housekeeping_cpumask()); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; -- cgit v1.2.3 From 7e56a1cf4b28f5739526877b8dbad623fae2e4e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:31 +0200 Subject: sched/isolation: Make the housekeeping cpumask private Nobody needs to access this detail. housekeeping_cpumask() already takes care of it. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-5-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 3589252ed476..16445097eb25 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -11,7 +11,41 @@ #include #include -cpumask_var_t housekeeping_mask; +static cpumask_var_t housekeeping_mask; + +int housekeeping_any_cpu(void) +{ + if (tick_nohz_full_enabled()) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); + + return smp_processor_id(); +} +EXPORT_SYMBOL_GPL(housekeeping_any_cpu); + +const struct cpumask *housekeeping_cpumask(void) +{ + if (tick_nohz_full_enabled()) + return housekeeping_mask; + + return cpu_possible_mask; +} +EXPORT_SYMBOL_GPL(housekeeping_cpumask); + +void housekeeping_affine(struct task_struct *t) +{ + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); +} +EXPORT_SYMBOL_GPL(housekeeping_affine); + +bool housekeeping_test_cpu(int cpu) +{ + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); + + return true; +} +EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { -- cgit v1.2.3 From e179f5a04ba46ee5c5439480c2bfd68c358168b7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:32 +0200 Subject: sched/isolation: Use its own static key Housekeeping code still depends on the nohz_full static key. Since we want to decouple housekeeping from NOHZ, let's create a housekeeping specific static key. It's mostly relevant for calls to is_housekeeping_cpu() from the scheduler. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-6-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 16445097eb25..bb8ba19a0235 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -10,12 +10,15 @@ #include #include #include +#include +DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); +EXPORT_SYMBOL_GPL(housekeeping_overriden); static cpumask_var_t housekeeping_mask; int housekeeping_any_cpu(void) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); @@ -24,7 +27,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu); const struct cpumask *housekeeping_cpumask(void) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return housekeeping_mask; return cpu_possible_mask; @@ -33,14 +36,14 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask); void housekeeping_affine(struct task_struct *t) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu) { - if (tick_nohz_full_enabled()) + if (static_branch_unlikely(&housekeeping_overriden)) return cpumask_test_cpu(cpu, housekeeping_mask); return true; @@ -62,6 +65,8 @@ void __init housekeeping_init(void) cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask); + static_branch_enable(&housekeeping_overriden); + /* We need at least one CPU to handle housekeeping work */ WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } -- cgit v1.2.3 From 204c083a009378dfa751175b5fcddc75988bab6c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:33 +0200 Subject: sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu() Fit it into the housekeeping_*() namespace. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-7-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad188acb7636..d0fb448dd43a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -527,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu)) return cpu; rcu_read_lock(); @@ -536,14 +536,14 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && is_housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i)) { cpu = i; goto unlock; } } } - if (!is_housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu)) cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 591481db8c6a..cdece8f967f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!is_housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) -- cgit v1.2.3 From 5c4991e24c69737bd41fc2737b1e3980abbf73f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:34 +0200 Subject: sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL Split the housekeeping config from CONFIG_NO_HZ_FULL. This way we finally separate the isolation code from NOHZ. Although a dependency to CONFIG_NO_HZ_FULL remains for now, while the housekeeping code still deals with NOHZ internals. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-8-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 871d43d73375..dbe61302c352 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,4 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_NO_HZ_FULL) += isolation.o +obj-$(CONFIG_CPU_ISOLATION) += isolation.o -- cgit v1.2.3 From de201559df872f83d0c08fb4effe3efd28e6cbc8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:35 +0200 Subject: sched/isolation: Introduce housekeeping flags Before we implement isolcpus under housekeeping, we need the isolation features to be more finegrained. For example some people want NOHZ_FULL without the full scheduler isolation, others want full scheduler isolation without NOHZ_FULL. So let's cut all these isolation features piecewise, at the risk of overcutting it right now. We can still merge some flags later if they always make sense together. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-9-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 2 +- kernel/sched/core.c | 8 ++++---- kernel/sched/fair.c | 2 +- kernel/sched/isolation.c | 26 +++++++++++++++----------- kernel/watchdog.c | 3 ++- 6 files changed, 24 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c7632f51c5a0..34125d23e58a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2584,7 +2584,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 79abeb0ca329..e3e60efaafee 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -719,7 +719,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) LIST_HEAD(rcu_tasks_holdouts); /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); /* * Each pass through the following loop makes one check for diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0fb448dd43a..2210c0203e51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -527,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) return cpu; rcu_read_lock(); @@ -536,15 +536,15 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu)) - cpu = housekeeping_any_cpu(); + if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) + cpu = housekeeping_any_cpu(HK_FLAG_TIMER); unlock: rcu_read_unlock(); return cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cdece8f967f0..f755de86a813 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index bb8ba19a0235..37a138a780ff 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -15,37 +15,39 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); static cpumask_var_t housekeeping_mask; +static unsigned int housekeeping_flags; -int housekeeping_any_cpu(void) +int housekeeping_any_cpu(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_any_and(housekeeping_mask, cpu_online_mask); - + if (housekeeping_flags & flags) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(void) +const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return housekeeping_mask; - + if (housekeeping_flags & flags) + return housekeeping_mask; return cpu_possible_mask; } 
EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t) +void housekeeping_affine(struct task_struct *t, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping_flags & flags) + set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu) +bool housekeeping_test_cpu(int cpu, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_test_cpu(cpu, housekeeping_mask); - + if (housekeeping_flags & flags) + return cpumask_test_cpu(cpu, housekeeping_mask); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); @@ -65,6 +67,8 @@ void __init housekeeping_init(void) cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask); + housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + static_branch_enable(&housekeeping_overriden); /* We need at least one CPU to handle housekeeping work */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 562652c9c815..e9e2ebb3db29 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -777,7 +777,8 @@ void __init lockup_detector_init(void) if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_copy(&watchdog_cpumask, housekeeping_cpumask()); + cpumask_copy(&watchdog_cpumask, + housekeeping_cpumask(HK_FLAG_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; -- cgit v1.2.3 From 6f1982fedd59856bcc42a9b521be4c3ffd2f60a7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:36 +0200 Subject: sched/isolation: Handle the nohz_full= parameter We want to centralize the isolation management, done by the housekeeping subsystem. Therefore we need to handle the nohz_full= parameter from there. Since nohz_full= so far has involved unbound timers, watchdog, RCU and tilegx NAPI isolation, we keep that default behaviour. nohz_full= will be deprecated in the future. We want to control the isolation features from the isolcpus= parameter. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-10-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 42 ++++++++++++++++++++++++++++++------------ kernel/time/tick-sched.c | 13 +++---------- 2 files changed, 33 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 37a138a780ff..1f61e440358d 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -54,23 +54,41 @@ EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!tick_nohz_full_enabled()) + if (!housekeeping_flags) return; - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; + static_branch_enable(&housekeeping_overriden); + + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); +} + +#ifdef CONFIG_NO_HZ_FULL +static int __init housekeeping_nohz_full_setup(char *str) +{ + cpumask_var_t non_housekeeping_mask; + + alloc_bootmem_cpumask_var(&non_housekeeping_mask); + if (cpulist_parse(str, non_housekeeping_mask) < 0) { + pr_warn("Housekeeping: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + if (cpumask_empty(housekeeping_mask)) + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - static_branch_enable(&housekeeping_overriden); + housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER | + HK_FLAG_RCU | HK_FLAG_MISC; - /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); + tick_nohz_full_setup(non_housekeeping_mask); + + free_bootmem_cpumask_var(non_housekeeping_mask); + + return 1; } +__setup("nohz_full=", housekeeping_nohz_full_setup); +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 27d7d522ac4e..69f3dbe38984 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -385,20 +385,13 @@ out: local_irq_restore(flags); } -/* Parse the boot-time nohz CPU list from the kernel parameters. */ -static int __init tick_nohz_full_setup(char *str) +/* Get the boot-time nohz CPU list from the kernel parameters. */ +void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); - if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); - free_bootmem_cpumask_var(tick_nohz_full_mask); - return 1; - } + cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; - - return 1; } -__setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { -- cgit v1.2.3 From edb9382175c3ebdced8ffdb3e0f20052ad9fdbe9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:37 +0200 Subject: sched/isolation: Move isolcpus= handling to the housekeeping code We want to centralize the isolation features, to be done by the housekeeping subsystem and scheduler domain isolation is a significant part of it. 
No intended behaviour change, we just reuse the housekeeping cpumask and core code. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 15 ++++-------- kernel/sched/core.c | 16 +----------- kernel/sched/isolation.c | 63 ++++++++++++++++++++++++++++++++++++++---------- kernel/sched/topology.c | 24 +++++------------- 4 files changed, 62 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4657e2924ecb..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -57,7 +57,7 @@ #include #include #include - +#include #include #include #include @@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ - cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; - if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) - goto done; - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - non_isolated_cpus); + housekeeping_cpumask(HK_FLAG_DOMAIN)); goto done; } @@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains, */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_FLAG_DOMAIN)))) continue; if (is_sched_load_balance(cp)) @@ -789,7 +785,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, non_isolated_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -802,7 +798,6 @@ restart: BUG_ON(nslot != ndoms); done: - free_cpumask_var(non_isolated_cpus); kfree(csa); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2210c0203e51..1a55c842bfbc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -84,9 +84,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* CPUs with isolated domains */ -cpumask_var_t cpu_isolated_map; - /* * __task_rq_lock - lock the rq @p resides on. 
*/ @@ -5735,10 +5732,6 @@ static inline void sched_init_smt(void) { } void __init sched_init_smp(void) { - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - sched_init_numa(); /* @@ -5748,16 +5741,12 @@ void __init sched_init_smp(void) */ mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); sched_init_granularity(); - free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); init_sched_dl_class(); @@ -5961,9 +5950,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); set_cpu_rq_start_time(smp_processor_id()); #endif diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 1f61e440358d..8f666bc5abe8 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -63,32 +63,69 @@ void __init housekeeping_init(void) WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } -#ifdef CONFIG_NO_HZ_FULL -static int __init housekeeping_nohz_full_setup(char *str) +static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); - if (cpulist_parse(str, non_housekeeping_mask) < 0) { - pr_warn("Housekeeping: Incorrect nohz_full cpumask\n"); + err = cpulist_parse(str, non_housekeeping_mask); + if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { + pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - - if (cpumask_empty(housekeeping_mask)) - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + if (!housekeeping_flags) { + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, non_housekeeping_mask); + if (cpumask_empty(housekeeping_mask)) + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + } else { + cpumask_var_t tmp; + + alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); + if (!cpumask_equal(tmp, housekeeping_mask)) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + free_bootmem_cpumask_var(tmp); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + free_bootmem_cpumask_var(tmp); + } - housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER | - HK_FLAG_RCU | HK_FLAG_MISC; + if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + tick_nohz_full_setup(non_housekeeping_mask); + } else { + pr_warn("Housekeeping: nohz unsupported." 
+ " Build with CONFIG_NO_HZ_FULL\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + } - tick_nohz_full_setup(non_housekeeping_mask); + housekeeping_flags |= flags; free_bootmem_cpumask_var(non_housekeeping_mask); return 1; } + +static int __init housekeeping_nohz_full_setup(char *str) +{ + unsigned int flags; + + flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + + return housekeeping_setup(str, flags); +} __setup("nohz_full=", housekeeping_nohz_full_setup); -#endif + +static int __init housekeeping_isolcpus_setup(char *str) +{ + return housekeeping_setup(str, HK_FLAG_DOMAIN); +} +__setup("isolcpus=", housekeeping_isolcpus_setup); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e3d31b0880dc..2e6b9126ffdc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,7 @@ */ #include #include +#include #include "sched.h" @@ -469,21 +470,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* Setup the mask of CPUs configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1); - return 0; - } - return 1; -} -__setup("isolcpus=", isolated_cpu_setup); - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -1792,7 +1778,7 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); @@ -1875,7 +1861,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], doms_new = alloc_sched_domains(1); if (doms_new) { n = 1; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } } else { n = ndoms_new; @@ -1898,7 +1885,8 @@ match1: if (!doms_new) { n = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } /* Build new domains: */ -- cgit v1.2.3 From 150dfee95feb913876ced5cc559a342384e8ea97 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:38 +0200 Subject: sched/isolation: Add basic isolcpus flags Add flags to control NOHZ and domain isolation from "isolcpus=", in order to centralize the isolation features to a common interface. Domain isolation remains the default so not to break the existing isolcpus boot paramater behaviour. Further flags in the future may include 0hz (1hz tick offload) and timers, workqueue, RCU, kthread, watchdog, likely all merged together in a common flag ("async"?). In any case, this will have to be modifiable by cpusets. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-12-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 8f666bc5abe8..b71b436f59f2 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -11,6 +11,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); @@ -126,6 +127,29 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { - return housekeeping_setup(str, HK_FLAG_DOMAIN); + unsigned int flags = 0; + + while (isalpha(*str)) { + if (!strncmp(str, "nohz,", 5)) { + str += 5; + flags |= HK_FLAG_TICK; + continue; + } + + if (!strncmp(str, "domain,", 7)) { + str += 7; + flags |= HK_FLAG_DOMAIN; + continue; + } + + pr_warn("isolcpus: Error, unknown flag\n"); + return 0; + } + + /* Default behaviour for isolcpus without flags */ + if (!flags) + flags |= HK_FLAG_DOMAIN; + + return housekeeping_setup(str, flags); } __setup("isolcpus=", housekeeping_isolcpus_setup); -- cgit v1.2.3 From 7d9285e82db5defca4d9674ba089429eeca0c697 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers" eBPF programs would like access to the (perf) event enabled and running times along with the event value, such that they can deal with event multiplexing (among other things). This patch extends the interface; a future eBPF patch will utilize the new functionality. [ Note, there's a same-content commit with a poor changelog and a meaningless title in the networking tree as well - but we need this change for subsequent perf work, so apply it here as well, with a proper changelog. Hopefully Git will be able to sort out this somewhat messy workflow, if there are no other, conflicting changes to these files. ] Signed-off-by: Yonghong Song [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David S. 
Miller Link: http://lkml.kernel.org/r/20171005161923.332790-2-yhs@fb.com Signed-off-by: Ingo Molnar --- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 15 +++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e2636737b69b..c4b9ab01bba5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index 04989fb769f0..74671e101b92 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi -- cgit v1.2.3 From ca0dd44cf37bfe316751e1701439c3ff44beb05c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:23:44 +0200 Subject: perf/core: Fix perf_event_read_value() locking perf_event_read_value() is an external accessor, just like perf_event_{en,dis}able() and should thus use perf_event_ctx_lock(). 
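[ Illustrative sketch, not part of the patch: the accessor's external contract is unchanged, only the locking moves inside. A hypothetical in-kernel caller that obtained 'event' elsewhere (e.g. from perf_event_create_kernel_counter()) would still read it as: ]

	u64 enabled, running, count;

	count = perf_event_read_value(event, &enabled, &running);
	/* count: aggregated event value; enabled/running: accumulated times */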
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f63a8daa5812 ("perf: Fix event->ctx locking") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 74671e101b92..d636f1d0e2ab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4398,7 +4398,7 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; @@ -4426,6 +4426,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) return total; } + +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + count = __perf_event_read_value(event, enabled, running); + perf_event_ctx_unlock(event, ctx); + + return count; +} EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, @@ -4528,7 +4540,7 @@ static int perf_read_one(struct perf_event *event, u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event, &enabled, &running); + values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) -- cgit v1.2.3 From 0ee098c97a6ebe163180e74b740c0a37bcdaa173 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:24:28 +0200 Subject: perf/core: Update ctx time before detaching events We should make sure the ctx time is updated before we detach events; which will want to update event times. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d636f1d0e2ab..5fae98626b15 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11129,6 +11129,7 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); -- cgit v1.2.3 From a9cd8194e1e6bd09619954721dfaf0f94fe2003e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:38:24 +0200 Subject: perf/core: Fix __perf_read_group_add() locking Event timestamps are serialized using ctx->lock, make sure to hold it over reading all values. 
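[ Illustrative sketch of the rule above, assuming 'leader' and its 'ctx' are in scope; not a literal excerpt from the patch. All of a group's counts and times must be published from a single ctx->lock section: ]

	struct perf_event *sub;
	unsigned long flags;
	u64 total;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	total = perf_event_count(leader);
	list_for_each_entry(sub, &leader->sibling_list, group_entry)
		total += perf_event_count(sub);	/* times cannot change under us */
	raw_spin_unlock_irqrestore(&ctx->lock, flags);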
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5fae98626b15..0f34f2a47eb6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4453,6 +4453,8 @@ static int __perf_read_group_add(struct perf_event *leader, if (ret) return ret; + raw_spin_lock_irqsave(&ctx->lock, flags); + /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one @@ -4475,8 +4477,6 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) -- cgit v1.2.3 From 3c5c8711dcb32115156cc7672fa095afa4339eaa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 13:44:51 +0200 Subject: perf/core: Make sure to update ctx time before using it We should make sure to update ctx time before we use it to update event times. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f34f2a47eb6..b249e0f197a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1893,6 +1893,11 @@ __perf_remove_from_context(struct perf_event *event, { unsigned long flags = (unsigned long)info; + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); @@ -1955,8 +1960,11 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - update_context_time(ctx); - update_cgrp_time_from_event(event); + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); -- cgit v1.2.3 From 8ca2bd41c7d1c135e9ac6f25970c2d491865088a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:12:35 +0200 Subject: perf/core: Rename 'enum perf_event_active_state' Its a weird name, active is one of the states, it should not be part of the name, also, its too long. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b249e0f197a7..6322e245176c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10801,7 +10801,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { - enum perf_event_active_state parent_state = parent_event->state; + enum perf_event_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; -- cgit v1.2.3 From 7f0ec32526d2446fdd79b0c6c6d08e64ef9fb1f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:17:58 +0200 Subject: perf/core: Remove wrong barrier The barrier and comment make no sense: - if what the barrier says is true, it should be wmb() but that should then be part of the arch driver, not the generic code. - if it is an SMP barrier, there must be a matching barrier, and there isn't one. So kill it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6322e245176c..205a4321f678 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2097,11 +2097,6 @@ event_sched_in(struct perf_event *event, event->hw.interrupts = 0; } - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - perf_pmu_disable(event->pmu); perf_set_shadow_time(event, ctx, tstamp); -- cgit v1.2.3 From 0c1cbc18df9e38182a0604b15535699c84d7342a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 16:26:44 +0200 Subject: perf/core: Fix perf_event_read() perf_event_read() has a number of issues regarding the timekeeping bits. - The IPI didn't update group times when it found INACTIVE - The direct call would not re-check ->state after taking ctx->lock which can result in ->count and timestamps getting out of sync. And we can make use of the ordering introduced for perf_event_stop() to make it more accurate for ACTIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 55 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 205a4321f678..6f74f9c35490 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2081,8 +2081,9 @@ event_sched_in(struct perf_event *event, WRITE_ONCE(event->oncpu, smp_processor_id()); /* - * Order event::oncpu write to happen before the ACTIVE state - * is visible. + * Order event::oncpu write to happen before the ACTIVE state is + * visible. This allows perf_event_{stop,read}() to observe the correct + * ->oncpu if it sees ACTIVE. 
*/ smp_wmb(); WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); @@ -3638,12 +3639,16 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + if (!data->group) + update_event_times(event); + else + update_group_times(event); + if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3658,7 +3663,6 @@ static void __perf_event_read(void *info) pmu->read(event); list_for_each_entry(sub, &event->sibling_list, group_entry) { - update_event_times(sub); if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -3748,23 +3752,35 @@ out: static int perf_event_read(struct perf_event *event, bool group) { + enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - struct perf_read_data data = { - .event = event, - .group = group, - .ret = 0, - }; +again: + if (state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data; + + /* + * Orders the ->state and ->oncpu loads such that if we see + * ACTIVE we must also see the right ->oncpu. + * + * Matches the smp_wmb() from event_sched_in(). + */ + smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; + data = (struct perf_read_data){ + .event = event, + .group = group, + .ret = 0, + }; + preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); @@ -3781,20 +3797,27 @@ static int perf_event_read(struct perf_event *event, bool group) (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + + } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); + state = event->state; + if (state != PERF_EVENT_STATE_INACTIVE) { + raw_spin_unlock_irqrestore(&ctx->lock, flags); + goto again; + } + /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time + * May read while context is not active (e.g., thread is + * blocked), in that case we cannot update context time */ - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } + if (group) update_group_times(event); else -- cgit v1.2.3 From 0d3d73aac2ff05c78387aa9dcc2c8aa3804405e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Sep 2017 14:16:28 +0200 Subject: perf/core: Rewrite event timekeeping The current even timekeeping, which computes enabled and running times, uses 3 distinct timestamps to reflect the various event states: OFF (stopped), INACTIVE (enabled) and ACTIVE (running). Furthermore, the update rules are such that even INACTIVE events need their timestamps updated. This is undesirable because we'd like to not touch INACTIVE events if at all possible, this makes event scheduling (much) more expensive than needed. Rewrite the timekeeping to directly use event->state, this greatly simplifies the code and results in only having to update things when we change state, or an up-to-date value is requested (read). 
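[ Simplified model of the state-based accounting described above, not a literal copy of the patch: a single event->tstamp records the last state change, and on every transition or read the elapsed delta is folded into the totals according to the state it was spent in: ]

	u64 delta = now - event->tstamp;

	if (state >= PERF_EVENT_STATE_INACTIVE)
		event->total_time_enabled += delta;
	if (state >= PERF_EVENT_STATE_ACTIVE)
		event->total_time_running += delta;
	event->tstamp = now;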
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 385 +++++++++++++++++---------------------------------- 1 file changed, 129 insertions(+), 256 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f74f9c35490..2551e8ce7224 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } +/* + * State based event timekeeping... + * + * The basic idea is to use event->state to determine which (if any) time + * fields to increment with the current delta. This means we only need to + * update timestamps when we change state or when they are explicitly requested + * (read). + * + * Event groups make things a little more complicated, but not terribly so. The + * rules for a group are that if the group leader is OFF the entire group is + * OFF, irrespecive of what the group member states are. This results in + * __perf_effective_state(). + * + * A futher ramification is that when a group leader flips between OFF and + * !OFF, we need to update all group member times. + * + * + * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we + * need to make sure the relevant context time is updated before we try and + * update our timestamps. + */ + +static __always_inline enum perf_event_state +__perf_effective_state(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + if (leader->state <= PERF_EVENT_STATE_OFF) + return leader->state; + + return event->state; +} + +static __always_inline void +__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) +{ + enum perf_event_state state = __perf_effective_state(event); + u64 delta = now - event->tstamp; + + *enabled = event->total_time_enabled; + if (state >= PERF_EVENT_STATE_INACTIVE) + *enabled += delta; + + *running = event->total_time_running; + if (state >= PERF_EVENT_STATE_ACTIVE) + *running += delta; +} + +static void perf_event_update_time(struct perf_event *event) +{ + u64 now = perf_event_time(event); + + __perf_update_times(event, now, &event->total_time_enabled, + &event->total_time_running); + event->tstamp = now; +} + +static void perf_event_update_sibling_time(struct perf_event *leader) +{ + struct perf_event *sibling; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) + perf_event_update_time(sibling); +} + +static void +perf_event_set_state(struct perf_event *event, enum perf_event_state state) +{ + if (event->state == state) + return; + + perf_event_update_time(event); + /* + * If a group leader gets enabled/disabled all its siblings + * are affected too. + */ + if ((event->state < 0) ^ (state < 0)) + perf_event_update_sibling_time(event); + + WRITE_ONCE(event->state, state); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) event->shadow_ctx_time = now - t->timestamp; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. 
- */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} - /* * Update cpuctx->cgrp so that it is set when first cgroup event is added and * cleared when last cgroup event is removed. @@ -972,17 +1020,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return 0; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} - static inline void list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) @@ -1396,60 +1433,6 @@ static u64 perf_event_time(struct perf_event *event) return ctx ? ctx->time : 0; } -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - lockdep_assert_held(&ctx->lock); - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. - */ - if (is_cgroup_event(event)) - run_end = perf_cgroup_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. 
- */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; @@ -1492,6 +1475,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; + event->tstamp = perf_event_time(event); + /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that @@ -1699,8 +1684,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) list_del_init(&event->group_entry); - update_group_times(event); - /* * If event was in error state, then keep it * that way, otherwise bogus counts will be @@ -1709,7 +1692,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * of the event */ if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; + perf_event_set_state(event, PERF_EVENT_STATE_OFF); ctx->generation++; } @@ -1808,38 +1791,24 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - u64 delta; + enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE && - !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - if (event->state != PERF_EVENT_STATE_ACTIVE) return; perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; - event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; + state = PERF_EVENT_STATE_OFF; } + perf_event_set_state(event, state); if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1859,7 +1828,9 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - int state = group_event->state; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; perf_pmu_disable(ctx->pmu); @@ -1873,7 +1844,7 @@ group_sched_out(struct perf_event *group_event, perf_pmu_enable(ctx->pmu); - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + if (group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -1965,12 +1936,12 @@ static void __perf_event_disable(struct perf_event *event, update_cgrp_time_from_event(event); } - update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); else event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; + + perf_event_set_state(event, PERF_EVENT_STATE_OFF); } /* @@ -2027,8 +1998,7 @@ void perf_event_disable_inatomic(struct perf_event *event) } static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) + struct perf_event_context *ctx) { /* * use the correct time source for the time snapshot @@ 
-2056,9 +2026,9 @@ static void perf_set_shadow_time(struct perf_event *event, * is cleaner and simpler to understand. */ if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); + perf_cgroup_set_shadow_time(event, event->tstamp); else - event->shadow_ctx_time = tstamp - ctx->timestamp; + event->shadow_ctx_time = event->tstamp - ctx->timestamp; } #define MAX_INTERRUPTS (~0ULL) @@ -2071,7 +2041,6 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); int ret = 0; lockdep_assert_held(&ctx->lock); @@ -2086,7 +2055,7 @@ event_sched_in(struct perf_event *event, * ->oncpu if it sees ACTIVE. */ smp_wmb(); - WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); + perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2100,19 +2069,17 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx, tstamp); + perf_set_shadow_time(event, ctx); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2136,8 +2103,6 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = ctx->pmu; - u64 now = ctx->time; - bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -2167,27 +2132,13 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. + * The events up to the failed event are scheduled out normally. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - simulate = true; + break; - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } + event_sched_out(event, cpuctx, ctx); } event_sched_out(group_event, cpuctx, ctx); @@ -2229,46 +2180,11 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } -/* - * Complement to update_event_times(). This computes the tstamp_* values to - * continue 'enabled' state from @now, and effectively discards the time - * between the prior tstamp_stopped and now (as we were in the OFF state, or - * just switched (context) time base). - * - * This further assumes '@event->state == INACTIVE' (we just came from OFF) and - * cannot have been scheduled in yet. And going into INACTIVE state means - * '@event->tstamp_stopped = @now'. 
- * - * Thus given the rules of update_event_times(): - * - * total_time_enabled = tstamp_stopped - tstamp_enabled - * total_time_running = tstamp_stopped - tstamp_running - * - * We can insert 'tstamp_stopped == now' and reverse them to compute new - * tstamp_* values. - */ -static void __perf_event_enable_time(struct perf_event *event, u64 now) -{ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); - - event->tstamp_stopped = now; - event->tstamp_enabled = now - event->total_time_enabled; - event->tstamp_running = now - event->total_time_running; -} - static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - list_add_event(event, ctx); perf_group_attach(event); - /* - * We can be called with event->state == STATE_OFF when we create with - * .disabled = 1. In that case the IOC_ENABLE will call this function. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2499,28 +2415,6 @@ again: raw_spin_unlock_irq(&ctx->lock); } -/* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_event_mark_enabled(struct perf_event *event) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - __perf_event_enable_time(event, tstamp); - list_for_each_entry(sub, &event->sibling_list, group_entry) { - /* XXX should not be > INACTIVE if event isn't */ - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(sub, tstamp); - } -} - /* * Cross CPU call to enable a performance event */ @@ -2539,14 +2433,12 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->is_active) ctx_sched_out(ctx, cpuctx, EVENT_TIME); - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2866,18 +2758,10 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: + if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); - /* fall-through */ - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } + perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event @@ -3114,10 +2998,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -3125,10 +3005,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. 
*/ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -3150,10 +3028,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -3545,7 +3419,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } @@ -3644,10 +3518,9 @@ static void __perf_event_read(void *info) update_cgrp_time_from_event(event); } - if (!data->group) - update_event_times(event); - else - update_group_times(event); + perf_event_update_time(event); + if (data->group) + perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3696,7 +3569,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value, { unsigned long flags; int ret = 0; - u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3727,23 +3599,26 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } - now = event->shadow_ctx_time + perf_clock(); - if (enabled) - *enabled = now - event->tstamp_enabled; + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) { + if (event->oncpu == smp_processor_id()) event->pmu->read(event); - if (running) - *running = now - event->tstamp_running; - } else if (running) { - *running = event->total_time_running; - } *value = local64_read(&event->count); + if (enabled || running) { + u64 now = event->shadow_ctx_time + perf_clock(); + u64 __enabled, __running; + + __perf_update_times(event, now, &__enabled, &__running); + if (enabled) + *enabled = __enabled; + if (running) + *running = __running; + } out: local_irq_restore(flags); @@ -3818,10 +3693,9 @@ again: update_cgrp_time_from_event(event); } + perf_event_update_time(event); if (group) - update_group_times(event); - else - update_event_times(event); + perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4945,8 +4819,7 @@ static void calc_timer_values(struct perf_event *event, *now = perf_clock(); ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; + __perf_update_times(event, ctx_time, enabled, running); } static void perf_event_init_userpage(struct perf_event *event) @@ -10581,7 +10454,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* -- cgit v1.2.3 From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Thu, 26 Oct 2017 01:47:42 +0000 Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object 
related syscall") included linux/bpf.h in linux/security.h. As a result, bpf programs including bpf_helpers.h and some other header that ends up pulling in also security.h, such as several examples under samples/bpf, fail to compile because bpf_tail_call and bpf_get_stackid are now "redefined as different kind of symbol". >From bpf.h: u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); Whereas in bpf_helpers.h they are: static void (*bpf_tail_call)(void *ctx, void *map, int index); static int (*bpf_get_stackid)(void *ctx, void *map, int flags); Fix this by removing the unused declaration of bpf_tail_call and moving the declaration of bpf_get_stackid in bpf_trace.c, which is the only place where it's needed. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b65011d320e3..136aa6bb0422 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -15,6 +15,8 @@ #include #include "trace.h" +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event -- cgit v1.2.3 From 8108a77515126f6db4374e8593956e20430307c0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:34 -0700 Subject: bpf: bpf_compute_data uses incorrect cb structure SK_SKB program types use bpf_compute_data to store the end of the packet data. However, bpf_compute_data assumes the cb is stored in the qdisc layer format. But, for SK_SKB this is the wrong layer of the stack for this type. It happens to work (sort of!) because in most cases nothing happens to be overwritten today. This is very fragile and error prone. Fortunately, we have another hole in tcp_skb_cb we can use so lets put the data_end value there. Note, SK_SKB program types do not use data_meta, they are failed by sk_skb_is_valid_access(). Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 2b6eb35ae5d3..6778fb773934 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -93,6 +93,14 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +/* compute the linear packet data range [data, data_end) for skb when + * sk_skb type programs are in use. + */ +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -108,7 +116,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.map = NULL; skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -368,7 +376,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. 
*/ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); -- cgit v1.2.3 From bfa640757e9378c2f26867e723f1287e94f5a7ad Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:53 -0700 Subject: bpf: rename sk_actions to align with bpf infrastructure Recent additions to support multiple programs in cgroups impose a strict requirement, "all yes is yes, any no is no". To enforce this the infrastructure requires the 'no' return code, SK_DROP in this case, to be 0. To apply these rules to SK_SKB program types the sk_actions return codes need to be adjusted. This fix adds SK_PASS and makes 'SK_DROP = 0'. Finally, remove SK_ABORTED to remove any chance that the API may allow aborted program flows to be passed up the stack. This would be incorrect behavior and allow programs to break existing policies. Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6778fb773934..66f00a2b27f4 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -122,7 +122,8 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) preempt_enable(); skb->sk = NULL; - return rc; + return rc == SK_PASS ? + (TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP; } static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -- cgit v1.2.3 From 250a53d6fcd86012935d1cf71eb2e3d6e88c412c Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 27 Oct 2017 10:34:33 +0200 Subject: genirq: Document vcpu_info usage for percpu_devid interrupts It is currently unclear how to set the VCPU affinity for a percpu_devid interrupt , since the Linux irq_data structure describes the state for multiple interrupts, one for each physical CPU on the system. Since each such interrupt can be associated with different VCPUs or none at all, associating a single VCPU state with such an interrupt does not capture the necessary semantics. The implementers of irq_set_affinity are the Intel and AMD IOMMUs, and the ARM GIC irqchip. The Intel and AMD callers do not appear to use percpu_devid interrupts, and the ARM GIC implementation only checks the pointer against NULL vs. non-NULL. Therefore, simply update the function documentation to explain the expected use in the context of percpu_devid interrupts, allowing future changes or additions to irqchip implementers to do the right thing. 
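For illustration, a minimal sketch of how a caller could hand per-CPU vCPU state to a percpu_devid interrupt under the semantics documented here. Only irq_set_vcpu_affinity() is the real kernel API; the payload struct, array and function names are made up, and per the changelog the ARM GIC only distinguishes NULL from non-NULL.

	#include <linux/interrupt.h>
	#include <linux/threads.h>
	#include <linux/types.h>

	/* Hypothetical per-CPU payload; its layout is whatever the irqchip user needs. */
	struct example_vcpu_info {
		bool forward_to_vcpu;
	};

	static struct example_vcpu_info example_vcpu_info[NR_CPUS];

	static int example_forward_ppi(unsigned int host_ppi)
	{
		/*
		 * A percpu_devid irq number stands for one interrupt per CPU,
		 * so pass the base of a per-CPU array and let each CPU's entry
		 * describe its own vCPU association (or none at all).
		 */
		return irq_set_vcpu_affinity(host_ppi, example_vcpu_info);
	}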
Signed-off-by: Christoffer Dall Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: kvm@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: Eric Auger Cc: kvmarm@lists.cs.columbia.edu Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/1509093281-15225-13-git-send-email-cdall@linaro.org --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e667912d0e9c..c65f282cbc9a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -381,7 +381,8 @@ int irq_select_affinity_usr(unsigned int irq) /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data + * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU + * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU * affinity for an irq. The vCPU specific data is passed from -- cgit v1.2.3 From be96b316deff35e119760982c43af74e606fa143 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 28 Oct 2017 09:49:37 -0700 Subject: perf/cgroup: Fix perf cgroup hierarchy support The following commit: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") made list_update_cgroup_event() skip setting cpuctx->cgrp if no cgroup event targets %current's cgroup. This breaks perf_event's hierarchical support because events which target one of the ancestors get ignored. Fix it by using cgroup_is_descendant() test instead of equality. Signed-off-by: Tejun Heo Acked-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Linus Torvalds Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: stable@vger.kernel.org # v4.9+ Fixes: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") Link: http://lkml.kernel.org/r/20171028164237.GA972780@devbig577.frc2.facebook.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d93db81fa36..10cdb9c26b5d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -901,9 +901,11 @@ list_update_cgroup_event(struct perf_event *event, cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ if (add) { + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); - if (perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) + cpuctx->cgrp = cgrp; } else { list_del(cpuctx_entry); cpuctx->cgrp = NULL; -- cgit v1.2.3 From bc8293663b953c23ff7b73eb15f82393425e5e47 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 22 Oct 2017 22:30:55 +0800 Subject: printk: fix typo in printk_safe.c Link: http://lkml.kernel.org/r/1508682655-27293-1-git-send-email-bhe@redhat.com Cc: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Baoquan He Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk_safe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 3cdaeaef9ce1..89558b85f45c 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -75,7 +75,7 @@ static void queue_flush_work(struct 
printk_safe_seq_buf *s) * have dedicated buffers, because otherwise printk-safe preempted by * NMI-printk would have overwritten the NMI messages. * - * The messages are fushed from irq work (or from panic()), possibly, + * The messages are flushed from irq work (or from panic()), possibly, * from other CPU, concurrently with printk_safe_log_store(). Should this * happen, printk_safe_log_store() will notice the buffer->len mismatch * and repeat the write. -- cgit v1.2.3 From cef572ad9bd7f85035ba8272e5352040e8be0152 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Sat, 28 Oct 2017 11:07:28 +0800 Subject: workqueue: Fix NULL pointer dereference When queue_work() is used in irq (not in task context), there is a potential case that trigger NULL pointer dereference. ---------------------------------------------------------------- worker_thread() |-spin_lock_irq() |-process_one_work() |-worker->current_pwq = pwq |-spin_unlock_irq() |-worker->current_func(work) |-spin_lock_irq() |-worker->current_pwq = NULL |-spin_unlock_irq() //interrupt here |-irq_handler |-__queue_work() //assuming that the wq is draining |-is_chained_work(wq) |-current_wq_worker() //Here, 'current' is the interrupted worker! |-current->current_pwq is NULL here! |-schedule() ---------------------------------------------------------------- Avoid it by checking for task context in current_wq_worker(), and if not in task context, we shouldn't use the 'current' to check the condition. Reported-by: Xiaofei Tan Signed-off-by: Li Bin Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo Fixes: 8d03ecfe4718 ("workqueue: reimplement is_chained_work() using current_wq_worker()") Cc: stable@vger.kernel.org # v3.9+ --- kernel/workqueue_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8635417c587b..29fa81f0f51a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -9,6 +9,7 @@ #include #include +#include struct worker_pool; @@ -59,7 +60,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } -- cgit v1.2.3 From c3ba13298709f46e72b22d087d0aa02bd012e4b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Oct 2017 08:13:14 -0700 Subject: cgroup: mark @cgrp __maybe_unused in cpu_stat_show() The local variable @cgrp isn't used if !CONFIG_CGROUP_SCHED. Mark the variable with __maybe_unused to avoid a compile warning. 
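The same pattern in miniature (a sketch; the config symbol and values are hypothetical): the attribute keeps -Wunused-variable quiet in configurations where the only consumer of the variable is compiled out, without wrapping the declaration itself in an #ifdef.

	#include <linux/compiler.h>

	static int example_show(void)
	{
		int __maybe_unused dbg_token = 42;	/* only read in the debug configuration */

	#ifdef CONFIG_EXAMPLE_DEBUG
		return dbg_token;
	#else
		return 0;
	#endif
	}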
Reported-by: "kbuild-all@01.org" Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9773e49a1b4..d6ed725f36d9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3357,7 +3357,7 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, static int cpu_stat_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; cgroup_stat_show_cputime(seq); -- cgit v1.2.3 From 9afe77ed849de6af8532b4c1b9310102eed9edf7 Mon Sep 17 00:00:00 2001 From: Maxim Akristiniy Date: Mon, 23 Oct 2017 19:51:48 +0300 Subject: added new line symbol after warning about dropped messages so this message will not mess with the next one Cc: Steven Rostedt Signed-off-by: Maxim Akristiniy Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 512f7c2baedd..5d81206a572d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2190,7 +2190,7 @@ again: } if (console_seq < log_first_seq) { - len = sprintf(text, "** %u printk messages dropped ** ", + len = sprintf(text, "** %u printk messages dropped **\n", (unsigned)(log_first_seq - console_seq)); /* messages are gone, move to first one */ -- cgit v1.2.3 From 0f295b0650c90362b4111f46d7f9149a0a4191be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 13 Oct 2017 11:54:33 -0600 Subject: rtc: Allow rtc drivers to specify the tv_nsec value for ntp ntp is currently hardwired to try and call the rtc set when wall clock tv_nsec is 0.5 seconds. This historical behaviour works well with certain PC RTCs, but is not universal to all rtc hardware. Change how this works by introducing the driver specific concept of set_offset_nsec, the delay between current wall clock time and the target time to set (with a 0 tv_nsecs). For x86-style CMOS set_offset_nsec should be -0.5 s which causes the last second to be written 0.5 s after it has started. For compat with the old rtc_set_ntp_time, the value is defaulted to + 0.5 s, which causes the next second to be written 0.5s before it starts, as things were before this patch. Testing shows many non-x86 RTCs would like set_offset_nsec ~= 0, so ultimately each RTC driver should set the set_offset_nsec according to its needs, and non x86 architectures should stop using update_persistent_clock64 in order to access this feature. Future patches will revise the drivers as needed. Since CMOS and RTC now have very different handling they are split into two dedicated code paths, sharing the support code, and ifdefs are replaced with IS_ENABLED. 
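As a rough driver-side illustration of the new knob (a sketch only: it assumes the struct rtc_device member is spelled set_offset_nsec as the changelog describes, and the driver, ops and device names are invented):

	#include <linux/err.h>
	#include <linux/module.h>
	#include <linux/platform_device.h>
	#include <linux/rtc.h>

	static const struct rtc_class_ops example_rtc_ops;	/* read_time/set_time omitted */

	static int example_rtc_probe(struct platform_device *pdev)
	{
		struct rtc_device *rtc;

		rtc = devm_rtc_device_register(&pdev->dev, "example-rtc",
					       &example_rtc_ops, THIS_MODULE);
		if (IS_ERR(rtc))
			return PTR_ERR(rtc);

		/*
		 * This hardware starts counting the written second immediately,
		 * so ask the sync work to call us with tv_nsec == 0 rather than
		 * the legacy CMOS-style 0.5 s lead.
		 */
		rtc->set_offset_nsec = 0;
		return 0;
	}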
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Jason Gunthorpe Signed-off-by: John Stultz --- kernel/time/ntp.c | 166 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 113 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index edf19cc53140..bc19de1a0683 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -492,6 +492,67 @@ out: return leap; } +static void sync_hw_clock(struct work_struct *work); +static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); + +static void sched_sync_hw_clock(struct timespec64 now, + unsigned long target_nsec, bool fail) + +{ + struct timespec64 next; + + getnstimeofday64(&next); + if (!fail) + next.tv_sec = 659; + else { + /* + * Try again as soon as possible. Delaying long periods + * decreases the accuracy of the work queue timer. Due to this + * the algorithm is very likely to require a short-sleep retry + * after the above long sleep to synchronize ts_nsec. + */ + next.tv_sec = 0; + } + + /* Compute the needed delay that will get to tv_nsec == target_nsec */ + next.tv_nsec = target_nsec - next.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + + queue_delayed_work(system_power_efficient_wq, &sync_work, + timespec64_to_jiffies(&next)); +} + +static void sync_rtc_clock(void) +{ + unsigned long target_nsec; + struct timespec64 adjust, now; + int rc; + + if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) + return; + + getnstimeofday64(&now); + + adjust = now; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* + * The current RTC in use will provide the target_nsec it wants to be + * called at, and does rtc_tv_nsec_ok internally. + */ + rc = rtc_set_ntp_time(adjust, &target_nsec); + if (rc == -ENODEV) + return; + + sched_sync_hw_clock(now, target_nsec, rc); +} + #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock(struct timespec now) { @@ -507,76 +568,75 @@ int __weak update_persistent_clock64(struct timespec64 now64) } #endif -#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) -static void sync_cmos_clock(struct work_struct *work); - -static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); - -static void sync_cmos_clock(struct work_struct *work) +static bool sync_cmos_clock(void) { + static bool no_cmos; struct timespec64 now; - struct timespec64 next; - int fail = 1; + struct timespec64 adjust; + int rc = -EPROTO; + long target_nsec = NSEC_PER_SEC / 2; + + if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) + return false; + + if (no_cmos) + return false; /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - * We want the clock to be within a couple of ticks from the target. + * Historically update_persistent_clock64() has followed x86 + * semantics, which match the MC146818A/etc RTC. This RTC will store + * 'adjust' and then in .5s it will advance once second. + * + * Architectures are strongly encouraged to use rtclib and not + * implement this legacy API. 
*/ - if (!ntp_synced()) { - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - } - getnstimeofday64(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec64 adjust = now; - - fail = -ENODEV; + if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); -#ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock64(adjust); -#endif - -#ifdef CONFIG_RTC_SYSTOHC - if (fail == -ENODEV) - fail = rtc_set_ntp_time(adjust); -#endif + rc = update_persistent_clock64(adjust); + /* + * The machine does not support update_persistent_clock64 even + * though it defines CONFIG_GENERIC_CMOS_UPDATE. + */ + if (rc == -ENODEV) { + no_cmos = true; + return false; + } } - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; + sched_sync_hw_clock(now, target_nsec, rc); + return true; +} - if (!fail || fail == -ENODEV) - next.tv_sec = 659; - else - next.tv_sec = 0; +/* + * If we have an externally synchronized Linux clock, then update RTC clock + * accordingly every ~11 minutes. Generally RTCs can only store second + * precision, but many RTCs will adjust the phase of their second tick to + * match the moment of update. This infrastructure arranges to call to the RTC + * set at the correct moment to phase synchronize the RTC second tick over + * with the kernel clock. + */ +static void sync_hw_clock(struct work_struct *work) +{ + if (!ntp_synced()) + return; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } - queue_delayed_work(system_power_efficient_wq, - &sync_cmos_work, timespec64_to_jiffies(&next)); + if (sync_cmos_clock()) + return; + + sync_rtc_clock(); } void ntp_notify_cmos_timer(void) { - queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); -} - -#else -void ntp_notify_cmos_timer(void) { } -#endif + if (!ntp_synced()) + return; + if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || + IS_ENABLED(CONFIG_RTC_SYSTOHC)) + queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +} /* * Propagate a new txc->status value into the NTP state: -- cgit v1.2.3 From e0956dcc4ba74ec4b17e32fc9a156fcba1ef6610 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:44 +0200 Subject: timekeeping: Consolidate timekeeping_inject_offset code The code to check the adjtimex() or clock_adjtime() arguments is spread out across multiple files for presumably only historic reasons. As a preparatation for a rework to get rid of the use of 'struct timeval' and 'struct timespec' in there, this moves all the portions into kernel/time/timekeeping.c and marks them as 'static'. The warp_clock() function here is not as closely related as the others, but I feel it still makes sense to move it here in order to consolidate all callers of timekeeping_inject_offset(). 
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann [jstultz: Whitespace fixup] Signed-off-by: John Stultz --- kernel/time/ntp.c | 61 ---------------------- kernel/time/ntp_internal.h | 1 - kernel/time/time.c | 36 +------------ kernel/time/timekeeping.c | 123 ++++++++++++++++++++++++++++++++++++++++++++- kernel/time/timekeeping.h | 2 +- 5 files changed, 123 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bc19de1a0683..90f84582a076 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -713,67 +713,6 @@ static inline void process_adjtimex_modes(struct timex *txc, } - -/** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex - */ -int ntp_validate_timex(struct timex *txc) -{ - if (txc->modes & ADJ_ADJTIME) { - /* singleshot must not be used with any other mode bits */ - if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) - return -EINVAL; - if (!(txc->modes & ADJ_OFFSET_READONLY) && - !capable(CAP_SYS_TIME)) - return -EPERM; - } else { - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - /* - * if the quartz is off by more than 10% then - * something is VERY wrong! - */ - if (txc->modes & ADJ_TICK && - (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ)) - return -EINVAL; - } - - if (txc->modes & ADJ_SETOFFSET) { - /* In order to inject time, you gotta be super-user! */ - if (!capable(CAP_SYS_TIME)) - return -EPERM; - - if (txc->modes & ADJ_NANO) { - struct timespec ts; - - ts.tv_sec = txc->time.tv_sec; - ts.tv_nsec = txc->time.tv_usec; - if (!timespec_inject_offset_valid(&ts)) - return -EINVAL; - - } else { - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; - } - } - - /* - * Check for potential multiplication overflows that can - * only happen on 64-bit systems: - */ - if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { - if (LLONG_MIN / PPM_SCALE > txc->freq) - return -EINVAL; - if (LLONG_MAX / PPM_SCALE < txc->freq) - return -EINVAL; - } - - return 0; -} - - /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index d8a7c11fa71a..74b52cd48209 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,7 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 44a8c1402133..04684e294f00 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -157,40 +157,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, return 0; } -/* - * Indicates if there is an offset between the system clock and the hardware - * clock/persistent clock/rtc. - */ -int persistent_clock_is_local; - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. 
Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; - - persistent_clock_is_local = 1; - adjust.tv_sec = sys_tz.tz_minuteswest * 60; - adjust.tv_nsec = 0; - timekeeping_inject_offset(&adjust); - } -} - /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, @@ -224,7 +190,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz if (firsttime) { firsttime = 0; if (!tv) - warp_clock(); + timekeeping_warp_clock(); } } if (tv) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..7d8e0e842484 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1258,13 +1258,39 @@ out: } EXPORT_SYMBOL(do_settimeofday64); +/* + * Validates if a timespec/timeval used to inject a time offset is valid. + * Offsets can be postive or negative. The value of the timeval/timespec + * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must + * always be non-negative. + */ +static inline bool timeval_inject_offset_valid(const struct timeval *tv) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_inject_offset_valid(const struct timespec *ts) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more nanoseconds then a second */ + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; @@ -1303,7 +1329,40 @@ error: /* even if we error out, we forwarded the time, so call update */ return ret; } -EXPORT_SYMBOL(timekeeping_inject_offset); + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. 
+ */ +void timekeeping_warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} /** * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic @@ -2247,6 +2306,66 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, return base; } +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +static int ntp_validate_timex(struct timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + + /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index d0914676d4c5..44aec7893cdd 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -10,7 +10,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -extern int timekeeping_inject_offset(struct timespec *ts); +extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); -- cgit v1.2.3 From 1572fa03784831b81ec26ec379374cf6bdec04fb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:45 +0200 Subject: timekeeping: Use timespec64 in timekeeping_inject_offset As part of changing all the timekeeping code to use 64-bit time_t consistently, this removes the uses of timeval and timespec as much as possible from do_adjtimex() and timekeeping_inject_offset(). The timeval_inject_offset_valid() and timespec_inject_offset_valid() just complicate this, so I'm folding them into the respective callers. This leaves the actual 'struct timex' definition, which is part of the user-space ABI and should be dealt with separately when we have agreed on the ABI change. 
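For reference, the user-visible side of the validation being reworked can be exercised from userspace roughly like this (a sketch; ADJ_SETOFFSET needs CAP_SYS_TIME, and with ADJ_NANO the time.tv_usec field carries nanoseconds and must stay within [0, NSEC_PER_SEC) while tv_sec may be negative):

	#include <stdio.h>
	#include <sys/timex.h>

	int main(void)
	{
		struct timex txc = {
			.modes = ADJ_SETOFFSET | ADJ_NANO,
			/* inject +2.5 s into the system clock */
			.time  = { .tv_sec = 2, .tv_usec = 500000000 },
		};
		int state = adjtimex(&txc);

		if (state == -1) {
			perror("adjtimex");
			return 1;
		}
		printf("clock state after injection: %d\n", state);
		return 0;
	}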
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 72 ++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7d8e0e842484..c6a35fb3cf76 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1258,65 +1258,37 @@ out: } EXPORT_SYMBOL(do_settimeofday64); -/* - * Validates if a timespec/timeval used to inject a time offset is valid. - * Offsets can be postive or negative. The value of the timeval/timespec - * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must - * always be non-negative. - */ -static inline bool timeval_inject_offset_valid(const struct timeval *tv) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_inject_offset_valid(const struct timespec *ts) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more nanoseconds then a second */ - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -static int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec64 ts64, tmp; + struct timespec64 tmp; int ret = 0; - if (!timespec_inject_offset_valid(ts)) + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - ts64 = timespec_to_timespec64(*ts); - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), ts64); - if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || + tmp = timespec64_add(tk_xtime(tk), *ts); + if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || !timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, &ts64); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1355,7 +1327,7 @@ int persistent_clock_is_local; void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; + struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; @@ -2307,9 +2279,9 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, } /** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int ntp_validate_timex(struct timex *txc) +static int timekeeping_validate_timex(struct timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2337,16 +2309,22 @@ static int ntp_validate_timex(struct timex 
*txc) if (!capable(CAP_SYS_TIME)) return -EPERM; - if (txc->modes & ADJ_NANO) { - struct timespec ts; + /* + * Validate if a timespec/timeval used to inject a time + * offset is valid. Offsets can be postive or negative, so + * we don't check tv_sec. The value of the timeval/timespec + * is the sum of its fields,but *NOTE*: + * The field tv_usec/tv_nsec must always be non-negative and + * we can't have more nanoseconds/microseconds than a second. + */ + if (txc->time.tv_usec < 0) + return -EINVAL; - ts.tv_sec = txc->time.tv_sec; - ts.tv_nsec = txc->time.tv_usec; - if (!timespec_inject_offset_valid(&ts)) + if (txc->modes & ADJ_NANO) { + if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; - } else { - if (!timeval_inject_offset_valid(&txc->time)) + if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } @@ -2378,12 +2356,12 @@ int do_adjtimex(struct timex *txc) int ret; /* Validate the data before disabling interrupts */ - ret = ntp_validate_timex(txc); + ret = timekeeping_validate_timex(txc); if (ret) return ret; if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; + struct timespec64 delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) -- cgit v1.2.3 From 85bf19e7df2479140eff2348a4e6a9c19b5c3960 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:46 +0200 Subject: time: Remove unused functions The (slow but) ongoing work on conversion from timespec to timespec64 has led some timespec based helper functions to become unused. No new code should use them, so we can remove the functions entirely. I'm planning to obsolete additional interfaces next and remove more of these. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/time.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 04684e294f00..947fb614c78f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -818,24 +818,6 @@ unsigned long nsecs_to_jiffies(u64 n) } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); -/* - * Add two timespec values and do a safety check for overflow. - * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). -- cgit v1.2.3 From abc8f96e3eb846fcf6333395ee1f6ed4a734576c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:48 +0200 Subject: time: Move time_t conversion helpers to time32.h On 64-bit architectures, the timespec64 based helpers in linux/time.h are defined as macros pointing to their timespec based counterparts. This made sense when they were first introduced, but as we are migrating away from timespec in general, it's much less intuitive now. This changes the macros to work in the exact opposite way: we always provide the timespec64 based helpers and define the old interfaces as macros for them. Now we can move those macros into linux/time32.h, which already contains the respective helpers for 32-bit architectures. 
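The flipped direction can be pictured with a sketch like this (illustrative only, not the literal header text): on 64-bit builds struct timespec and struct timespec64 share a layout, so the timespec64 helper is the real implementation and the legacy name becomes a thin alias, where previously the aliasing ran the other way.

	#if __BITS_PER_LONG == 64
	# define timespec_equal(a, b)					\
		timespec64_equal((const struct timespec64 *)(a),	\
				 (const struct timespec64 *)(b))
	#endif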
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 947fb614c78f..fe60ebd301cf 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -407,6 +407,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); +#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -467,6 +468,7 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); +#endif /** * ns_to_timeval - Convert nanoseconds to timeval @@ -486,7 +488,6 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); -#if BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -547,7 +548,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec64); -#endif + /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds -- cgit v1.2.3 From 16c0890dc66d258fdeccf7b15a133f3930b19143 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 31 Oct 2017 02:46:54 +0100 Subject: irq/work: Don't reinvent the wheel but use existing llist API Use the proper llist APIs instead of open-coded variants of them. Signed-off-by: Byungchul Park Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1509414414-14987-1-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..e2ebe8c71e8f 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -138,11 +138,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - while (llnode != NULL) { - work = llist_entry(llnode, struct irq_work, llnode); - - llnode = llist_next(llnode); - + llist_for_each_entry(work, llnode, llnode) { /* * Clear the PENDING bit, after this point the @work * can be re-used. -- cgit v1.2.3 From ab97f87325e28b7ef7717e6cb62e8da14a7176e1 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 13:26:02 +0300 Subject: fsnotify: convert fsnotify_mark.refcnt from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable fsnotify_mark.refcnt is used as pure reference counter. Convert it to refcount_t and fix up the operations. 
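The counter scheme listed above maps onto the refcount_t API roughly as follows (a sketch with an invented object type; only the refcount_* calls are the real interface from <linux/refcount.h>):

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct example_obj {
		refcount_t refcnt;
		/* ... payload ... */
	};

	static struct example_obj *example_alloc(void)
	{
		struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

		if (obj)
			refcount_set(&obj->refcnt, 1);	/* counter starts at 1 */
		return obj;
	}

	/* Lookup path: only take a reference if the object is still live. */
	static bool example_get(struct example_obj *obj)
	{
		return refcount_inc_not_zero(&obj->refcnt);
	}

	static void example_put(struct example_obj *obj)
	{
		if (refcount_dec_and_test(&obj->refcnt))	/* last reference frees */
			kfree(obj);
	}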
Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Jan Kara --- kernel/audit_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 011d46e5f73f..45ec960ad536 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1007,7 +1007,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify * We are guaranteed to have at least one reference to the mark from * either the inode or the caller of fsnotify_destroy_mark(). */ - BUG_ON(atomic_read(&entry->refcnt) < 1); + BUG_ON(refcount_read(&entry->refcnt) < 1); } static const struct fsnotify_ops audit_tree_ops = { -- cgit v1.2.3 From aa4bf44dc851c6bdd4f7b61b5f2c56c84dfe2ff0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:40 +0200 Subject: userns: use union in {g,u}idmap struct - Add a struct containing two pointer to extents and wrap both the static extent array and the struct into a union. This is done in preparation for bumping the {g,u}idmap limits for user namespaces. - Add brackets around anonymous union when using designated initializers to initialize members in order to please gcc <= 4.4. Signed-off-by: Christian Brauner Signed-off-by: Eric W. Biederman --- kernel/user.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 00281add65b2..9a20acce460d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,26 +26,32 @@ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .gid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .projid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .count = ATOMIC_INIT(3), -- cgit v1.2.3 From 6397fac4915ab3002dc15aae751455da1a852f25 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:41 +0200 Subject: userns: bump idmap limits to 340 There are quite some use cases where users run into the current limit for {g,u}id mappings. Consider a user requesting us to map everything but 999, and 1001 for a given range of 1000000000 with a sub{g,u}id layout of: some-user:100000:1000000000 some-user:999:1 some-user:1000:1 some-user:1001:1 some-user:1002:1 This translates to: MAPPING-TYPE | CONTAINER | HOST | RANGE | -------------|-----------|---------|-----------| uid | 999 | 999 | 1 | uid | 1001 | 1001 | 1 | uid | 0 | 1000000 | 999 | uid | 1000 | 1001000 | 1 | uid | 1002 | 1001002 | 999998998 | ------------------------------------------------ gid | 999 | 999 | 1 | gid | 1001 | 1001 | 1 | gid | 0 | 1000000 | 999 | gid | 1000 | 1001000 | 1 | gid | 1002 | 1001002 | 999998998 | which is already the current limit. As discussed at LPC simply bumping the number of limits is not going to work since this would mean that struct uid_gid_map won't fit into a single cache-line anymore thereby regressing performance for the base-cases. The same problem seems to arise when using a single pointer. 
So the idea is to use struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; For the base cases we will only use the struct uid_gid_extent extent member. If we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single 4k kmalloc() which means we can have a maximum of 340 mappings (340 * size(struct uid_gid_extent) = 4080). For the latter case we use two pointers "forward" and "reverse". The forward pointer points to an array sorted by "first" and the reverse pointer points to an array sorted by "lower_first". We can then perform binary search on those arrays. Performance Testing: When Eric introduced the extent-based struct uid_gid_map approach he measured the performanc impact of his idmap changes: > My benchmark consisted of going to single user mode where nothing else was > running. On an ext4 filesystem opening 1,000,000 files and looping through all > of the files 1000 times and calling fstat on the individuals files. This was > to ensure I was benchmarking stat times where the inodes were in the kernels > cache, but the inode values were not in the processors cache. My results: > v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled) > v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled) > v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled) I used an identical approach on my laptop. Here's a thorough description of what I did. I built a 4.14.0-rc4 mainline kernel with my new idmap patches applied. I booted into single user mode and used an ext4 filesystem to open/create 1,000,000 files. Then I looped through all of the files calling fstat() on each of them 1000 times and calculated the mean fstat() time for a single file. (The test program can be found below.) Here are the results. For fun, I compared the first version of my patch which scaled linearly with the new version of the patch: | # MAPPINGS | PATCH-V1 | PATCH-NEW | |--------------|------------|-----------| | 0 mappings | 158 ns | 158 ns | | 1 mappings | 164 ns | 157 ns | | 2 mappings | 170 ns | 158 ns | | 3 mappings | 175 ns | 161 ns | | 5 mappings | 187 ns | 165 ns | | 10 mappings | 218 ns | 199 ns | | 50 mappings | 528 ns | 218 ns | | 100 mappings | 980 ns | 229 ns | | 200 mappings | 1880 ns | 239 ns | | 300 mappings | 2760 ns | 240 ns | | 340 mappings | not tested | 248 ns | Here's the test program I used. I asked Eric what he did and this is a more "advanced" implementation of the idea. 
It's pretty straight-forward: #define __GNU_SOURCE #define __STDC_FORMAT_MACROS #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char *argv[]) { int ret; size_t i, k; int fd[1000000]; int times[1000]; char pathname[4096]; struct stat st; struct timeval t1, t2; uint64_t time_in_mcs; uint64_t sum = 0; if (argc != 2) { fprintf(stderr, "Please specify a directory where to create " "the test files\n"); exit(EXIT_FAILURE); } for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { sprintf(pathname, "%s/idmap_test_%zu", argv[1], i); fd[i]= open(pathname, O_RDWR | O_CREAT, S_IXUSR | S_IXGRP | S_IXOTH); if (fd[i] < 0) { ssize_t j; for (j = i; j >= 0; j--) close(fd[j]); exit(EXIT_FAILURE); } } for (k = 0; k < 1000; k++) { ret = gettimeofday(&t1, NULL); if (ret < 0) goto close_all; for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { ret = fstat(fd[i], &st); if (ret < 0) goto close_all; } ret = gettimeofday(&t2, NULL); if (ret < 0) goto close_all; time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) - (1000000 * t1.tv_sec + t1.tv_usec); printf("Total time in micro seconds: %" PRIu64 "\n", time_in_mcs); printf("Total time in nanoseconds: %" PRIu64 "\n", time_in_mcs * 1000); printf("Time per file in nanoseconds: %" PRIu64 "\n", (time_in_mcs * 1000) / 1000000); times[k] = (time_in_mcs * 1000) / 1000000; } close_all: for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) close(fd[i]); if (ret < 0) exit(EXIT_FAILURE); for (k = 0; k < 1000; k++) { sum += times[k]; } printf("Mean time per file in nanoseconds: %" PRIu64 "\n", sum / 1000); exit(EXIT_SUCCESS);; } Signed-off-by: Christian Brauner CC: Serge Hallyn CC: Eric Biederman Signed-off-by: Eric W. Biederman --- kernel/user_namespace.c | 350 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 320 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..5fd2d53dbc75 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); @@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->gid_map.forward); + kfree(ns->gid_map.reverse); + } + if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->uid_map.forward); + kfree(ns->uid_map.reverse); + } + if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->projid_map.forward); + kfree(ns->projid_map.reverse); + } retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); @@ -198,7 +212,84 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +/** + * idmap_key struct holds the information necessary to find an idmapping in a + * sorted idmap array. It is passed to cmp_map_id() as first argument. + */ +struct idmap_key { + bool map_up; /* true -> id from kid; false -> kid from id */ + u32 id; /* id to find */ + u32 count; /* == 0 unless used with map_id_range_down() */ +}; + +/** + * cmp_map_id - Function to be passed to bsearch() to find the requested + * idmapping. Expects struct idmap_key to be passed via @k. 
+ */ +static int cmp_map_id(const void *k, const void *e) +{ + u32 first, last, id2; + const struct idmap_key *key = k; + const struct uid_gid_extent *el = e; + + /* handle map_id_range_down() */ + if (key->count) + id2 = key->id + key->count - 1; + else + id2 = key->id; + + /* handle map_id_{down,up}() */ + if (key->map_up) + first = el->lower_first; + else + first = el->first; + + last = first + el->count - 1; + + if (key->id >= first && key->id <= last && + (id2 >= first && id2 <= last)) + return 0; + + if (key->id < first || id2 < first) + return -1; + + return 1; +} + +/** + * map_id_range_down_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = false; + key.count = count; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; +} + +/** + * map_id_range_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) { unsigned idx, extents; u32 first, last, id2; @@ -224,7 +315,23 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_range_down_base(map, id, count); + + return map_id_range_down_max(map, id, count); +} + +/** + * map_id_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_down_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -247,7 +354,23 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +static u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_down_base(map, id); + + return map_id_range_down_max(map, id, 0); +} + +/** + * map_id_up_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -270,6 +393,45 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +/** + * map_id_up_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 
+ */ +static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = true; + key.count = 0; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->lower_first) + extent->first; + else + id = (u32) -1; + + return id; +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_up_base(map, id); + + return map_id_up_max(map, id); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in @@ -540,13 +702,15 @@ static int projid_m_show(struct seq_file *seq, void *v) static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { - struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; - if (pos < map->nr_extents) - extent = &map->extent[pos]; + if (pos >= map->nr_extents) + return NULL; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return &map->extent[pos]; - return extent; + return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) @@ -618,7 +782,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map, u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; - prev = &new_map->extent[idx]; + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + prev = &new_map->extent[idx]; + else + prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; @@ -638,6 +805,102 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } +/** + * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. + * Takes care to allocate a 4K block of memory if the number of mappings exceeds + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) +{ + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { + map->extent[map->nr_extents].first = extent->first; + map->extent[map->nr_extents].lower_first = extent->lower_first; + map->extent[map->nr_extents].count = extent->count; + return 0; + } + + if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { + struct uid_gid_extent *forward; + + /* Allocate memory for 340 mappings. */ + forward = kmalloc(sizeof(struct uid_gid_extent) * + UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + if (!forward) + return -ENOMEM; + + /* Copy over memory. Only set up memory for the forward pointer. + * Defer the memory setup for the reverse pointer. 
+ */ + memcpy(forward, map->extent, + map->nr_extents * sizeof(map->extent[0])); + + map->forward = forward; + map->reverse = NULL; + } + + map->forward[map->nr_extents].first = extent->first; + map->forward[map->nr_extents].lower_first = extent->lower_first; + map->forward[map->nr_extents].count = extent->count; + return 0; +} + +/* cmp function to sort() forward mappings */ +static int cmp_extents_forward(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->first < e2->first) + return -1; + + if (e1->first > e2->first) + return 1; + + return 0; +} + +/* cmp function to sort() reverse mappings */ +static int cmp_extents_reverse(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->lower_first < e2->lower_first) + return -1; + + if (e1->lower_first > e2->lower_first) + return 1; + + return 0; +} + +/** + * sort_idmaps - Sorts an array of idmap entries. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int sort_idmaps(struct uid_gid_map *map) +{ + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return 0; + + /* Sort forward array. */ + sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_forward, NULL); + + /* Only copy the memory from forward we actually need. */ + map->reverse = kmemdup(map->forward, + map->nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL); + if (!map->reverse) + return -ENOMEM; + + /* Sort reverse array. */ + sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_reverse, NULL); + + return 0; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -648,7 +911,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent = NULL; + struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; @@ -673,6 +936,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, */ mutex_lock(&userns_state_mutex); + memset(&new_map, 0, sizeof(struct uid_gid_map)); + ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) @@ -700,9 +965,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Parse the user data */ ret = -EINVAL; pos = kbuf; - new_map.nr_extents = 0; for (; pos; pos = next_line) { - extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); @@ -714,17 +977,17 @@ static ssize_t map_write(struct file *file, const char __user *buf, } pos = skip_spaces(pos); - extent->first = simple_strtoul(pos, &pos, 10); + extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->lower_first = simple_strtoul(pos, &pos, 10); + extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->count = simple_strtoul(pos, &pos, 10); + extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; @@ -734,29 +997,33 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; /* Verify we have been given valid starting values */ - if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1)) + if ((extent.first == (u32) -1) || + 
(extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ - if ((extent->first + extent->count) <= extent->first) + if ((extent.first + extent.count) <= extent.first) goto out; - if ((extent->lower_first + extent->count) <= - extent->lower_first) + if ((extent.lower_first + extent.count) <= + extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ - if (mappings_overlap(&new_map, extent)) + if (mappings_overlap(&new_map, &extent)) goto out; - new_map.nr_extents++; - - /* Fail if the file contains too many extents */ - if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; + + ret = insert_extent(&new_map, &extent); + if (ret < 0) + goto out; + ret = -EINVAL; + + new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) @@ -767,16 +1034,26 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + + ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { + struct uid_gid_extent *e; u32 lower_first; - extent = &new_map.extent[idx]; + + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + e = &new_map.extent[idx]; + else + e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, - extent->lower_first, - extent->count); + e->lower_first, + e->count); /* Fail if we can not map the specified extent to * the kernel global id space. @@ -784,18 +1061,31 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (lower_first == (u32) -1) goto out; - extent->lower_first = lower_first; + e->lower_first = lower_first; } /* Install the map */ - memcpy(map->extent, new_map.extent, - new_map.nr_extents*sizeof(new_map.extent[0])); + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + memcpy(map->extent, new_map.extent, + new_map.nr_extents * sizeof(new_map.extent[0])); + } else { + map->forward = new_map.forward; + map->reverse = new_map.reverse; + } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: + if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(new_map.forward); + kfree(new_map.reverse); + map->forward = NULL; + map->reverse = NULL; + map->nr_extents = 0; + } + mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; -- cgit v1.2.3 From 11a8b9270e16e36d5fb607ba4b60db2958b7c625 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 15:54:32 -0500 Subject: userns: Don't special case a count of 0 We can always use a count of 1 so there is no reason to have a special case of a count of 0. Signed-off-by: Eric W. 
Biederman --- kernel/user_namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5fd2d53dbc75..c9904ee084c4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -232,11 +232,7 @@ static int cmp_map_id(const void *k, const void *e) const struct idmap_key *key = k; const struct uid_gid_extent *el = e; - /* handle map_id_range_down() */ - if (key->count) - id2 = key->id + key->count - 1; - else - id2 = key->id; + id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) @@ -362,7 +358,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return map_id_down_base(map, id); - return map_id_range_down_max(map, id, 0); + return map_id_range_down_max(map, id, 1); } /** @@ -404,7 +400,7 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id) struct idmap_key key; key.map_up = true; - key.count = 0; + key.count = 1; key.id = id; extents = map->nr_extents; -- cgit v1.2.3 From 3edf652fa16562fb57a5a4b996ba72e2d7cdc38b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 16:27:29 -0500 Subject: userns: Simplify the user and group mapping functions Consolidate reading the number of extents and computing the return value in the map_id_down, map_id_range_down and map_id_range. This removal of one read of extents makes one smp_rmb unnecessary and makes the code safe it is executed during the map write. Reading the number of extents twice and depending on the result being the same is not safe, as it could be 0 the first time and > 5 the second time, which would lead to misinterpreting the union fields. The consolidation of the return value just removes a duplicate caluculation which should make it easier to understand and maintain the code. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 132 +++++++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c9904ee084c4..563a2981d7c7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -256,28 +256,17 @@ static int cmp_map_id(const void *k, const void *e) * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +static struct uid_gid_extent * +map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { - u32 extents; - struct uid_gid_extent *extent; struct idmap_key key; key.map_up = false; key.count = count; key.id = id; - extents = map->nr_extents; - smp_rmb(); - - extent = bsearch(&key, map->forward, extents, - sizeof(struct uid_gid_extent), cmp_map_id); - /* Map the id or note failure */ - if (extent) - id = (id - extent->first) + extent->lower_first; - else - id = (u32) -1; - - return id; + return bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); } /** @@ -285,41 +274,43 @@ static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ -static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) +static struct uid_gid_extent * +map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { - unsigned idx, extents; + unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; - else - id = (u32) -1; - - return id; + return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { - u32 extents = map->nr_extents; + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_range_down_base(map, id, count); + extent = map_id_range_down_base(extents, map, id, count); + else + extent = map_id_range_down_max(extents, map, id, count); - return map_id_range_down_max(map, id, count); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; } /** @@ -327,38 +318,40 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_down_base(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id) { - unsigned idx, extents; + unsigned idx; u32 first, last; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; - else - id = (u32) -1; - - return id; + return NULL; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { - u32 extents = map->nr_extents; + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_down_base(map, id); + extent = map_id_down_base(extents, map, id); + else + extent = map_id_range_down_max(extents, map, id, 1); - return map_id_range_down_max(map, id, 1); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; } /** @@ -366,48 +359,50 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ -static u32 map_id_up_base(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { - unsigned idx, extents; + unsigned idx; u32 first, last; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].first; - else - id = (u32) -1; - - return id; + return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { - u32 extents; - struct uid_gid_extent *extent; struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; - extents = map->nr_extents; + return bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); - extent = bsearch(&key, map->reverse, extents, - sizeof(struct uid_gid_extent), cmp_map_id); + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = map_id_up_base(extents, map, id); + else + extent = map_id_up_max(extents, map, id); + /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; @@ -417,17 +412,6 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) -{ - u32 extents = map->nr_extents; - smp_rmb(); - - if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_up_base(map, id); - - return map_id_up_max(map, id); -} - /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in -- cgit v1.2.3 From d5e7b3c5f51fc6d34e12b6d87bfd30ab277c4625 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 17:09:34 -0500 Subject: userns: Don't read extents twice in m_start This is important so reading /proc//{uid_map,gid_map,projid_map} while the map is being written does not do strange things. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 563a2981d7c7..4f7e357ac1e2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; + unsigned extents = map->nr_extents; + smp_rmb(); - if (pos >= map->nr_extents) + if (pos >= extents) return NULL; - if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; -- cgit v1.2.3 From ece66133979b211324cc6aff9285889b425243d2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 16:53:09 -0500 Subject: userns: Make map_id_down a wrapper for map_id_range_down There is no good reason for this code duplication, the number of cache line accesses not the number of instructions are the bottleneck in this code. Therefore simplify maintenance by removing unnecessary code. 
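To spell the resulting shape out (a sketch restating the hunk below, not additional patch content): with count == 1 the end of the searched range in cmp_map_id(), id2 = key->id + key->count - 1, collapses back to key->id, so a point lookup is just a degenerate range lookup and needs no code of its own:

static u32 map_id_down(struct uid_gid_map *map, u32 id)
{
	/* A single id is a range of length one; reuse the range lookup. */
	return map_id_range_down(map, id, 1);
}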
Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4f7e357ac1e2..1d0298870ee3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -313,45 +313,9 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -/** - * map_id_down_base - Find idmap via binary search in static extent array. - * Can only be called if number of mappings is equal or less than - * UID_GID_MAP_MAX_BASE_EXTENTS. - */ -static struct uid_gid_extent * -map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id) -{ - unsigned idx; - u32 first, last; - - /* Find the matching extent */ - for (idx = 0; idx < extents; idx++) { - first = map->extent[idx].first; - last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) - return &map->extent[idx]; - } - return NULL; -} - static u32 map_id_down(struct uid_gid_map *map, u32 id) { - struct uid_gid_extent *extent; - unsigned extents = map->nr_extents; - smp_rmb(); - - if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_down_base(extents, map, id); - else - extent = map_id_range_down_max(extents, map, id, 1); - - /* Map the id or note failure */ - if (extent) - id = (id - extent->first) + extent->lower_first; - else - id = (u32) -1; - - return id; + return map_id_range_down(map, id, 1); } /** -- cgit v1.2.3 From 3fda0e737e906ce73220b20c27e7f792d0aac6a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 17:15:30 -0500 Subject: userns: Simplify insert_extent Consolidate the code to write to the new mapping at the end of the function to remove the duplication. Move the increase in the number of mappings into insert_extent, keeping the logic together. Just a small increase in readability and maintainability. Signed-off-by: "Eric W. 
Biederman" --- kernel/user_namespace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d0298870ee3..899c31060ff3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -758,12 +758,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map, */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { - if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { - map->extent[map->nr_extents].first = extent->first; - map->extent[map->nr_extents].lower_first = extent->lower_first; - map->extent[map->nr_extents].count = extent->count; - return 0; - } + struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; @@ -784,9 +779,13 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) map->reverse = NULL; } - map->forward[map->nr_extents].first = extent->first; - map->forward[map->nr_extents].lower_first = extent->lower_first; - map->forward[map->nr_extents].count = extent->count; + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) + dest = &map->extent[map->nr_extents]; + else + dest = &map->forward[map->nr_extents]; + + *dest = *extent; + map->nr_extents++; return 0; } @@ -968,8 +967,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (ret < 0) goto out; ret = -EINVAL; - - new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) -- cgit v1.2.3 From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 31 Oct 2017 18:16:05 -0700 Subject: bpf: reduce verifier memory consumption the verifier got progressively smarter over time and size of its internal state grew as well. Time to reduce the memory consumption. Before: sizeof(struct bpf_verifier_state) = 6520 After: sizeof(struct bpf_verifier_state) = 896 It's done by observing that majority of BPF programs use little to no stack whereas verifier kept all of 512 stack slots ready always. Instead dynamically reallocate struct verifier state when stack access is detected. Runtime difference before vs after is within a noise. The number of processed instructions stays the same. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 437 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 288 insertions(+), 149 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d906775e12c1..5f26f7ad124f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -276,43 +276,132 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ")"); } } - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] == STACK_SPILL) - verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] == STACK_SPILL) + verbose(env, " fp%d=%s", + -MAX_BPF_STACK + i * BPF_REG_SIZE, + reg_type_str[state->stack[i].spilled_ptr.type]); } verbose(env, "\n"); } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { - struct bpf_verifier_stack_elem *elem; - int insn_idx; + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); + return 0; +} + +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. 
It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) +{ + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; + + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; + } + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; + kfree(state->stack); + state->stack = new_stack; + return 0; +} + +static void free_verifier_state(struct bpf_verifier_state *state) +{ + kfree(state->stack); + kfree(state); +} + +/* copy verifier state from src to dst growing dst stack space + * when necessary to accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + int err; + + err = realloc_verifier_state(dst, src->allocated_stack, false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + return copy_stack_state(dst, src); +} + +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, + int *insn_idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem, *head = env->head; + int err; if (env->head == NULL) - return -1; + return -ENOENT; - memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); - insn_idx = env->head->insn_idx; + if (cur) { + err = copy_verifier_state(cur, &head->st); + if (err) + return err; + } + if (insn_idx) + *insn_idx = head->insn_idx; if (prev_insn_idx) - *prev_insn_idx = env->head->prev_insn_idx; - elem = env->head->next; - kfree(env->head); + *prev_insn_idx = head->prev_insn_idx; + elem = head->next; + kfree(head); env->head = elem; env->stack_size--; - return insn_idx; + return 0; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { + struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; + int err; - elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; - memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + err = copy_verifier_state(&elem->st, cur); + if (err) + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; @@ -325,7 +414,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; err: /* pop all elements and return */ - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); return NULL; } @@ -550,7 +639,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -563,7 +652,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 
regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(&env->cur_state, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -601,10 +690,21 @@ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); + if (err) + return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ + if (!env->allow_ptr_leaks && + state->stack[spi].slot_type[0] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose(env, "attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } if (value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type)) { @@ -616,17 +716,18 @@ static int check_stack_write(struct bpf_verifier_env *env, } /* save register state */ - state->spilled_regs[spi] = state->regs[value_regno]; - state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + state->stack[spi].slot_type[i] = STACK_SPILL; } else { /* regular write of data into stack */ - state->spilled_regs[spi] = (struct bpf_reg_state) {}; + state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = + STACK_MISC; } return 0; } @@ -637,10 +738,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... 
then we depend on parent's value */ - parent->spilled_regs[slot].live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; } @@ -650,34 +751,37 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - u8 *slot_type; - int i, spi; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + u8 *stype; - slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + if (state->allocated_stack <= slot) { + verbose(env, "invalid read from stack off %d+0 size %d\n", + off, size); + return -EACCES; + } + stype = state->stack[spi].slot_type; - if (slot_type[0] == STACK_SPILL) { + if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { - if (slot_type[i] != STACK_SPILL) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { verbose(env, "corrupted spill memory\n"); return -EACCES; } } - spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; - if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->spilled_regs[spi]; + state->regs[value_regno] = state->stack[spi].spilled_ptr; mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { - if (slot_type[i] != STACK_MISC) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -694,7 +798,8 @@ static int check_stack_read(struct bpf_verifier_env *env, static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", @@ -706,9 +811,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -783,7 +888,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { @@ -797,7 +902,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; int err; @@ -866,7 +971,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); + return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, @@ -968,8 +1073,9 @@ static int 
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = &env->cur_state; - struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -993,7 +1099,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -1028,14 +1134,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); else - mark_reg_known_zero(env, state->regs, + mark_reg_known_zero(env, regs, value_regno); - state->regs[value_regno].id = 0; - state->regs[value_regno].off = 0; - state->regs[value_regno].range = 0; - state->regs[value_regno].type = reg_type; + regs[value_regno].id = 0; + regs[value_regno].off = 0; + regs[value_regno].range = 0; + regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -1061,19 +1167,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (env->prog->aux->stack_depth < -off) env->prog->aux->stack_depth = -off; - if (t == BPF_WRITE) { - if (!env->allow_ptr_leaks && - state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && - size != BPF_REG_SIZE) { - verbose(env, "attempt to corrupt spilled pointer on stack\n"); - return -EACCES; - } + if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, value_regno); - } else { + else err = check_stack_read(env, state, off, size, value_regno); - } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -1087,7 +1186,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1095,11 +1194,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && - state->regs[value_regno].type == SCALAR_VALUE) { + regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - state->regs[value_regno].var_off = tnum_cast( - state->regs[value_regno].var_off, size); - __update_reg_bounds(&state->regs[value_regno]); + regs[value_regno].var_off = + tnum_cast(regs[value_regno].var_off, size); + __update_reg_bounds(®s[value_regno]); } return err; } @@ -1156,9 +1255,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs; - int off, i; + int off, i, slot, 
spi; if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1198,7 +1297,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + slot = -(off + i) - 1; + spi = slot / BPF_REG_SIZE; + if (state->allocated_stack <= slot || + state->stack[spi].slot_type[slot % BPF_REG_SIZE] != + STACK_MISC) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1211,7 +1314,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; switch (reg->type) { case PTR_TO_PACKET: @@ -1229,7 +1332,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1514,7 +1617,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn) */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1522,10 +1625,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -1533,9 +1636,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { - struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1603,6 +1705,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; } + regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -1691,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1703,13 +1806,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad 
sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad ubounds\n"); return -EINVAL; @@ -1890,7 +1993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; s64 smin_val, smax_val; @@ -2111,7 +2214,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2185,12 +2288,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2200,7 +2303,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; @@ -2421,10 +2524,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, new_range); } @@ -2674,17 +2777,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -2876,7 +2979,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct 
bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -2937,7 +3040,7 @@ static bool may_access_skb(enum bpf_prog_type type) */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 mode = BPF_MODE(insn->code); int i, err; @@ -2999,7 +3102,7 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = &env->cur_state.regs[BPF_REG_0]; + reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); @@ -3363,6 +3466,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + struct idpair *idmap) +{ + int i, spi; + + /* if explored stack has more populated slots than current stack + * such stacks are not equivalent + */ + if (old->allocated_stack > cur->allocated_stack) + return false; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + spi = i / BPF_REG_SIZE; + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; + if (!regsafe(&old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, + idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + } + return true; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -3407,37 +3561,8 @@ static bool states_equal(struct bpf_verifier_env *env, goto out_free; } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (old->stack_slot_type[i] == STACK_INVALID) - continue; - if (old->stack_slot_type[i] != cur->stack_slot_type[i]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - goto out_free; - if (i % BPF_REG_SIZE) - continue; - if (old->stack_slot_type[i] != STACK_SPILL) - continue; - if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - idmap)) - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. 
- * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. - * return false to continue verification of this path - */ - goto out_free; - else - continue; - } + if (!stacksafe(old, cur, idmap)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -3473,17 +3598,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, } } /* ... and stack slots */ - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { - if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) continue; - if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (parent->spilled_regs[i].live & REG_LIVE_READ) + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; - if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) continue; - if (state->spilled_regs[i].live & REG_LIVE_READ) { - parent->spilled_regs[i].live |= REG_LIVE_READ; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; touched = true; } } @@ -3513,6 +3640,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; + struct bpf_verifier_state *cur = env->cur_state; int i; sl = env->explored_states[insn_idx]; @@ -3523,7 +3651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) { + if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -3534,7 +3662,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, &env->cur_state); + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -3546,16 +3674,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; /* add new state to the head of linked list */ - memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + copy_verifier_state(&new_sl->state, cur); new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - env->cur_state.parent = &new_sl->state; + cur->parent = &new_sl->state; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -3563,10 +3691,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) 
*/ for (i = 0; i < BPF_REG_FP; i++) - env->cur_state.regs[i].live = REG_LIVE_NONE; - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) - if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) - env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } @@ -3581,15 +3709,19 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; - init_reg_state(env, regs); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + env->cur_state = state; + init_reg_state(env, state->regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3637,7 +3769,7 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, state); do_print_state = false; } @@ -3651,6 +3783,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + regs = cur_regs(env); if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -3818,8 +3951,10 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - insn_idx = pop_stack(env, &prev_insn_idx); - if (insn_idx < 0) { + err = pop_stack(env, &prev_insn_idx, &insn_idx); + if (err < 0) { + if (err != -ENOENT) + return err; break; } else { do_print_state = true; @@ -4359,9 +4494,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret == 0) @@ -4464,9 +4601,11 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); mutex_unlock(&bpf_verifier_lock); -- cgit v1.2.3 From 04686ef299db5ff299bfc5a4504b189e46842078 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 31 Oct 2017 19:17:31 -0700 Subject: bpf: remove SK_REDIRECT from UAPI Now that SK_REDIRECT is no longer a valid return code. Remove it from the UAPI completely. Then do a namespace remapping internal to sockmap so SK_REDIRECT is no longer externally visible. Patchs primary change is to do a namechange from SK_REDIRECT to __SK_REDIRECT Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: David S. 
Miller --- kernel/bpf/sockmap.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 66f00a2b27f4..dbd7b322a86b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -101,13 +101,19 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); } +enum __sk_action { + __SK_DROP = 0, + __SK_PASS, + __SK_REDIRECT, +}; + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); int rc; if (unlikely(!prog)) - return SK_DROP; + return __SK_DROP; skb_orphan(skb); /* We need to ensure that BPF metadata for maps is also cleared @@ -122,8 +128,10 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) preempt_enable(); skb->sk = NULL; + /* Moving return codes from UAPI namespace into internal namespace */ return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP; + (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : + __SK_DROP; } static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) @@ -133,7 +141,7 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) rc = smap_verdict_func(psock, skb); switch (rc) { - case SK_REDIRECT: + case __SK_REDIRECT: sk = do_sk_redirect_map(skb); if (likely(sk)) { struct smap_psock *peer = smap_psock_sk(sk); @@ -149,7 +157,7 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) } } /* Fall through and free skb otherwise */ - case SK_DROP: + case __SK_DROP: default: kfree_skb(skb); } -- cgit v1.2.3 From 07c41a295c5f25928a7cb689fdec816bd0089fe8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 30 Oct 2017 13:50:22 -0700 Subject: bpf: avoid rcu_dereference inside bpf_event_mutex lock region During perf event attaching/detaching bpf programs, the tp_event->prog_array change is protected by the bpf_event_mutex lock in both attaching and deteching functions. Although tp_event->prog_array is a rcu pointer, rcu_derefrence is not needed to access it since mutex lock will guarantee ordering. Verified through "make C=2" that sparse locking check still happy with the new change. Also change the label name in perf_event_{attach,detach}_bpf_prog from "out" to "unlock" to reflect the code action after the label. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- kernel/trace/bpf_trace.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 136aa6bb0422..506efe6e8ed9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -769,20 +769,19 @@ int perf_event_attach_bpf_prog(struct perf_event *event, mutex_lock(&bpf_event_mutex); if (event->prog) - goto out; + goto unlock; - old_array = rcu_dereference_protected(event->tp_event->prog_array, - lockdep_is_held(&bpf_event_mutex)); + old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) - goto out; + goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); -out: +unlock: mutex_unlock(&bpf_event_mutex); return ret; } @@ -796,11 +795,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event) mutex_lock(&bpf_event_mutex); if (!event->prog) - goto out; - - old_array = rcu_dereference_protected(event->tp_event->prog_array, - lockdep_is_held(&bpf_event_mutex)); + goto unlock; + old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); @@ -812,6 +809,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_put(event->prog); event->prog = NULL; -out: +unlock: mutex_unlock(&bpf_event_mutex); } -- cgit v1.2.3 From 153fbd1226fb30b8630802aa5047b8af5ef53c9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Oct 2017 11:18:53 +0100 Subject: futex: Fix more put_pi_state() vs. exit_pi_state_list() races Dmitry (through syzbot) reported being able to trigger the WARN in get_pi_state() and a use-after-free on: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); Both are due to this race: exit_pi_state_list() put_pi_state() lock(&curr->pi_lock) while() { pi_state = list_first_entry(head); hb = hash_futex(&pi_state->key); unlock(&curr->pi_lock); dec_and_test(&pi_state->refcount); lock(&hb->lock) lock(&pi_state->pi_mutex.wait_lock) // uaf if pi_state free'd lock(&curr->pi_lock); .... unlock(&curr->pi_lock); get_pi_state(); // WARN; refcount==0 The problem is we take the reference count too late, and don't allow it being 0. Fix it by using inc_not_zero() and simply retrying the loop when we fail to get a refcount. In that case put_pi_state() should remove the entry from the list. 
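[Editor's note] For reference, below is a minimal userspace sketch (C11 atomics; the struct and helper names are invented for illustration, this is not the kernel implementation) of the "take a reference only if the count is not already zero" primitive the fix relies on. atomic_inc_not_zero() behaves like ref_get_not_zero() here: when it fails, exit_pi_state_list() knows the pi_state is already on its way to being freed, so it drops its locks, lets put_pi_state() make progress, and retries the loop.

/* Userspace analogue of the inc-not-zero refcount acquisition pattern. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct refcounted {
        atomic_int refcount;
};

/* Increment the refcount unless it is already zero; return false on failure. */
static bool ref_get_not_zero(struct refcounted *obj)
{
        int old = atomic_load(&obj->refcount);

        do {
                if (old == 0)
                        return false;   /* last reference already gone */
        } while (!atomic_compare_exchange_weak(&obj->refcount, &old, old + 1));

        return true;
}

int main(void)
{
        struct refcounted live  = { .refcount = 1 };
        struct refcounted dying = { .refcount = 0 };

        printf("live : got ref? %d\n", ref_get_not_zero(&live));   /* 1 */
        printf("dying: got ref? %d\n", ref_get_not_zero(&dying));  /* 0 -> caller retries */
        return 0;
}

The point of the not-zero variant is that it never resurrects an object whose last reference has already been dropped, which is exactly the use-after-free window the changelog describes.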
Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Gratian Crisan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: dvhart@infradead.org Cc: syzbot Cc: syzkaller-bugs@googlegroups.com Cc: Fixes: c74aef2d06a9 ("futex: Fix pi_state->owner serialization") Link: http://lkml.kernel.org/r/20171031101853.xpfh72y643kdfhjs@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/futex.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0518a0bfc746..ca5bb9cba5cf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -903,11 +903,27 @@ void exit_pi_state_list(struct task_struct *curr) */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { - next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = hash_futex(&key); + + /* + * We can race against put_pi_state() removing itself from the + * list (a waiter going away). put_pi_state() will first + * decrement the reference count and then modify the list, so + * its possible to see the list entry but fail this reference + * acquire. + * + * In that case; drop the locks to let put_pi_state() make + * progress and retry the loop. + */ + if (!atomic_inc_not_zero(&pi_state->refcount)) { + raw_spin_unlock_irq(&curr->pi_lock); + cpu_relax(); + raw_spin_lock_irq(&curr->pi_lock); + continue; + } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); @@ -918,8 +934,10 @@ void exit_pi_state_list(struct task_struct *curr) * task still owns the PI-state: */ if (head->next != next) { + /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + put_pi_state(pi_state); continue; } @@ -927,9 +945,8 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock(&curr->pi_lock); - get_pi_state(pi_state); + raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); -- cgit v1.2.3 From 10d94ff4d558b96bfc4f55bb0051ae4d938246fe Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 1 Nov 2017 10:14:51 +0600 Subject: irq/core: Fix boot crash when the irqaffinity= boot parameter is passed on CPUMASK_OFFSTACK=y kernels(v1) When the irqaffinity= kernel parameter is passed in a CPUMASK_OFFSTACK=y kernel, it fails to boot, because zalloc_cpumask_var() cannot be used before initializing the slab allocator to allocate a cpumask. So, use alloc_bootmem_cpumask_var() instead. Also do some cleanups while at it: in init_irq_default_affinity() remove an #ifdef via using cpumask_available(). 
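[Editor's note] A toy userspace model (illustrative names only, not kernel code) of the cpumask_available() cleanup described above: when the mask is an "offstack" pointer it can still be NULL early in boot, when it is embedded storage it is always present, and hiding that difference behind one helper lets the init path lose its #ifdef.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MASK_OFFSTACK 1                 /* model CONFIG_CPUMASK_OFFSTACK=y; flip to 0 for =n */

#if MASK_OFFSTACK
typedef unsigned long *mask_var_t;      /* needs a separate allocation */
static bool mask_available(mask_var_t m) { return m != NULL; }
static void mask_alloc(mask_var_t *m)    { *m = calloc(1, sizeof(**m)); }
#else
typedef unsigned long mask_var_t[1];    /* storage lives in the variable itself */
static bool mask_available(mask_var_t m) { (void)m; return true; }
static void mask_alloc(mask_var_t *m)    { (void)m; /* nothing to allocate */ }
#endif

static mask_var_t default_affinity;     /* stand-in for irq_default_affinity */

/* Mirrors the reworked init path: one NULL-or-not check, no #ifdef at the call site. */
static void init_default_affinity(void)
{
        if (!mask_available(default_affinity))
                mask_alloc(&default_affinity);

        if (*default_affinity == 0)     /* "cpumask_empty" stand-in */
                *default_affinity = ~0UL;       /* "cpumask_setall" stand-in */
}

int main(void)
{
        init_default_affinity();
        printf("mask = %lx\n", *default_affinity);
        return 0;
}

The other half of the fix, replacing zalloc_cpumask_var() with alloc_bootmem_cpumask_var() in the early __setup() handler, is purely about ordering: boot parameters are parsed before the slab allocator exists, so only the bootmem allocator can back an offstack cpumask at that point.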
Signed-off-by: Rakib Mullick Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171026045800.27087-1-rakib.mullick@gmail.com Link: http://lkml.kernel.org/r/20171101041451.12581-1-rakib.mullick@gmail.com Signed-off-by: Ingo Molnar --- kernel/irq/irqdesc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 982a3576fb01..f2edcf85780d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { - zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with @@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { -#ifdef CONFIG_CPUMASK_OFFSTACK - if (!irq_default_affinity) + if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); -#endif if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } -- cgit v1.2.3 From 1969db47f8d0e800397abd4ee4e8d27d2b578587 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Nov 2017 00:08:04 -0700 Subject: bpf: fix verifier memory leaks fix verifier memory leaks Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f26f7ad124f..2bb6d6aa7085 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -341,10 +341,12 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, return 0; } -static void free_verifier_state(struct bpf_verifier_state *state) +static void free_verifier_state(struct bpf_verifier_state *state, + bool free_self) { kfree(state->stack); - kfree(state); + if (free_self) + kfree(state); } /* copy verifier state from src to dst growing dst stack space @@ -382,6 +384,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (prev_insn_idx) *prev_insn_idx = head->prev_insn_idx; elem = head->next; + free_verifier_state(&head->st, false); kfree(head); env->head = elem; env->stack_size--; @@ -399,14 +402,14 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (!elem) goto err; - err = copy_verifier_state(&elem->st, cur); - if (err) - return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; env->head = elem; env->stack_size++; + err = copy_verifier_state(&elem->st, cur); + if (err) + goto err; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -3641,7 +3644,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i; + int i, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3679,7 +3682,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; /* add new state to the head of linked list */ - copy_verifier_state(&new_sl->state, cur); + err = 
copy_verifier_state(&new_sl->state, cur); + if (err) { + free_verifier_state(&new_sl->state, false); + kfree(new_sl); + return err; + } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ @@ -4424,6 +4432,7 @@ static void free_states(struct bpf_verifier_env *env) if (sl) while (sl != STATE_LIST_MARK) { sln = sl->next; + free_verifier_state(&sl->state, false); kfree(sl); sl = sln; } @@ -4494,7 +4503,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state); + free_verifier_state(env->cur_state, true); env->cur_state = NULL; skip_full_check: @@ -4601,7 +4610,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state); + free_verifier_state(env->cur_state, true); env->cur_state = NULL; skip_full_check: -- cgit v1.2.3 From 39c82caff8610d57ffe32157cb3130dfabe12fbe Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Thu, 26 Oct 2017 11:37:22 -0700 Subject: clockevents: Update clockevents device next_event on stop clockevent_device::next_event holds the next timer event of a clock event device. The value is updated in clockevents_program_event(), i.e. when the hardware timer is armed for the next expiry. When there are no software timers armed on a CPU, the corresponding per CPU clockevent device is brought into ONESHOT_STOPPED state, but clockevent_device::next_event is not updated, because clockevents_program_event() is not called. So the content of clockevent_device::next_event is stale, which is not an issue when real hardware is used. But the hrtimer broadcast device relies on that information and the stale value causes spurious wakeups. Update clockevent_device::next_event to KTIME_MAX when it has been brought into ONESHOT_STOPPED state to avoid spurious wakeups. This reflects the proper expiry time of the stopped timer: infinity. [ tglx: Massaged changelog ] Signed-off-by: Prasad Sodagudi Signed-off-by: Thomas Gleixner Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/1509043042-32486-1-git-send-email-psodagud@codeaurora.org --- kernel/time/tick-oneshot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 6b009c207671..c1f518e7aa80 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -33,6 +33,7 @@ int tick_program_event(ktime_t expires, int force) * We don't need the clock event device any more, stop it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + dev->next_event = KTIME_MAX; return 0; } -- cgit v1.2.3 From 9c388a5ed1960b2ebbebd3dbe7553092b0c15ec1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Oct 2017 22:32:00 +0100 Subject: watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Guenter reported a crash in the watchdog/perf code, which is caused by cleanup() and enable() running concurrently. The reason for this is: The watchdog functions are serialized via the watchdog_mutex and cpu hotplug locking, but the enable of the perf based watchdog happens in context of the unpark callback of the smpboot thread. But that unpark function is not synchronous inside the locking. 
The unparking of the thread just wakes it up and leaves so there is no guarantee when the thread is executing. If it starts running _before_ the cleanup happened then it will create a event and overwrite the dead event pointer. The new event is then cleaned up because the event is marked dead. lock(watchdog_mutex); lockup_detector_reconfigure(); cpus_read_lock(); stop(); park() update(); start(); unpark() cpus_read_unlock(); thread runs() overwrite dead event ptr cleanup(); free new event, which is active inside perf.... unlock(watchdog_mutex); The park side is safe as that actually waits for the thread to reach parked state. Commit a33d44843d45 removed the protection against this kind of scenario under the stupid assumption that the hotplug serialization and the watchdog_mutex cover everything. Bring it back. Reverts: a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Reported-and-tested-by: Guenter Roeck Signed-off-by: Thomas Feels-stupid Gleixner Cc: Peter Zijlstra Cc: Don Zickus Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710312145190.1942@nanos --- kernel/watchdog_hld.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71a62ceacdc8..a7f137c1933a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,6 +21,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; @@ -203,6 +204,8 @@ void hardlockup_detector_perf_disable(void) if (event) { perf_event_disable(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); watchdog_cpus--; } @@ -218,7 +221,7 @@ void hardlockup_detector_perf_cleanup(void) int cpu; for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = per_cpu(dead_event, cpu); /* * Required because for_each_cpu() reports unconditionally @@ -226,7 +229,7 @@ void hardlockup_detector_perf_cleanup(void) */ if (event) perf_event_release_kernel(event); - per_cpu(watchdog_ev, cpu) = NULL; + per_cpu(dead_event, cpu) = NULL; } cpumask_clear(&dead_events_mask); } -- cgit v1.2.3 From 42f930da7f00c0ab23df4c7aed36137f35988980 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 1 Nov 2017 14:11:27 -0400 Subject: watchdog/hardlockup/perf: Use atomics to track in-use cpu counter Guenter reported: There is still a problem. When running echo 6 > /proc/sys/kernel/watchdog_thresh echo 5 > /proc/sys/kernel/watchdog_thresh repeatedly, the message NMI watchdog: Enabled. Permanently consumes one hw-PMU counter. stops after a while (after ~10-30 iterations, with fluctuations). Maybe watchdog_cpus needs to be atomic ? That's correct as this again is affected by the asynchronous nature of the smpboot thread unpark mechanism. CPU 0 CPU1 CPU2 write(watchdog_thresh, 6) stop() park() update() start() unpark() thread->unpark() cnt++; write(watchdog_thresh, 5) thread->unpark() stop() park() thread->park() cnt--; cnt++; update() start() unpark() That's not a functional problem, it just affects the informational message. 
Convert watchdog_cpus to atomic_t to prevent the problem Reported-and-tested-by: Guenter Roeck Signed-off-by: Don Zickus Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20171101181126.j727fqjmdthjz4xk@redhat.com --- kernel/watchdog_hld.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index a7f137c1933a..a84b205fac9a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "NMI watchdog: " fmt #include +#include #include #include @@ -25,7 +26,7 @@ static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; -static unsigned int watchdog_cpus; +static atomic_t watchdog_cpus = ATOMIC_INIT(0); void arch_touch_nmi_watchdog(void) { @@ -189,7 +190,8 @@ void hardlockup_detector_perf_enable(void) if (hardlockup_detector_event_create()) return; - if (!watchdog_cpus++) + /* use original value for check */ + if (!atomic_fetch_inc(&watchdog_cpus)) pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); perf_event_enable(this_cpu_read(watchdog_ev)); @@ -207,7 +209,7 @@ void hardlockup_detector_perf_disable(void) this_cpu_write(watchdog_ev, NULL); this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); - watchdog_cpus--; + atomic_dec(&watchdog_cpus); } } -- cgit v1.2.3 From c3aff086ea11f17f5c0bec77bdbf4365ba6b41fc Mon Sep 17 00:00:00 2001 From: Andrew Clayton Date: Wed, 1 Nov 2017 15:49:59 +0000 Subject: signal: Fix name of SIGEMT in #if defined() check Commit cc731525f26a ("signal: Remove kernel interal si_code magic") added a check for SIGMET and NSIGEMT being defined. That SIGMET should in fact be SIGEMT, with SIGEMT being defined in arch/{alpha,mips,sparc}/include/uapi/asm/signal.h This was actually pointed out by BenHutchings in a lwn.net comment here https://lwn.net/Comments/734608/ Fixes: cc731525f26a ("signal: Remove kernel interal si_code magic") Signed-off-by: Andrew Clayton Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 800a18f77732..8dcd8825b2de 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2698,7 +2698,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, [SIGBUS] = { NSIGBUS, SIL_FAULT }, [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, -#if defined(SIGMET) && defined(NSIGEMT) +#if defined(SIGEMT) && defined(NSIGEMT) [SIGEMT] = { NSIGEMT, SIL_FAULT }, #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, -- cgit v1.2.3 From 03c4cc385faaa46e5877f499c6b997fef792f8d3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Nov 2017 12:44:45 +0100 Subject: bpf: cpumap micro-optimization in cpu_map_enqueue Discovered that the compiler laid-out asm code in suboptimal way when studying perf report during benchmarking of cpumap. Help the compiler by the marking unlikely code paths. Signed-off-by: Jesper Dangaard Brouer Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 86e29cbf7827..ce5b669003b2 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -208,7 +208,7 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; - if ((headroom - metasize) < sizeof(*xdp_pkt)) + if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) return NULL; /* Store info in top of packet */ @@ -656,7 +656,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct xdp_pkt *xdp_pkt; xdp_pkt = convert_to_xdp_pkt(xdp); - if (!xdp_pkt) + if (unlikely(!xdp_pkt)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ -- cgit v1.2.3 From b06723da824af1e979eb1699623881b5f45a783c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 1 Nov 2017 23:58:09 +0100 Subject: bpf: minor cleanups after merge Two minor cleanups after Dave's recent merge in f8ddadc4db6c ("Merge git://git.kernel.org...") of net into net-next in order to get the code in line with what was done originally in the net tree: i) use max() instead of max_t() since both ranges are u16, ii) don't split the direct access test cases in the middle with bpf_exit test cases from 390ee7e29fc ("bpf: enforce return code for cgroup-bpf programs"). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2bb6d6aa7085..2cc3e9486a1f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2532,7 +2532,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, continue; reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) - reg->range = max_t(u16, reg->range, new_range); + reg->range = max(reg->range, new_range); } } -- cgit v1.2.3 From 5beca081be9195b4316ac5f32921df0234ee8e0e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 1 Nov 2017 23:58:10 +0100 Subject: bpf: also improve pattern matches for meta access Follow-up to 0fd4759c5515 ("bpf: fix pattern matches for direct packet access") to cover also the remaining data_meta/data matches in the verifier. The matches are also refactored a bit to simplify handling of all the cases. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 165 +++++++++++++++++++++++++++++--------------------- 1 file changed, 96 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cc3e9486a1f..530b68550fd2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2787,6 +2787,99 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, } } +static bool try_match_pkt_pointers(const struct bpf_insn *insn, + struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + struct bpf_verifier_state *this_branch, + struct bpf_verifier_state *other_branch) +{ + if (BPF_SRC(insn->code) != BPF_X) + return false; + + switch (BPF_OP(insn->code)) { + case BPF_JGT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end > pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + case BPF_JLT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end < pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JGE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JLE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + default: + return false; + } + + return true; +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -2893,75 +2986,9 @@ 
static int check_cond_jmp_op(struct bpf_verifier_env *env, */ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' > pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end > pkt_data' */ - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' < pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET, - true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end < pkt_data' */ - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' >= pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end >= pkt_data' */ - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' <= pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end <= pkt_data' */ - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { - find_good_pkt_pointers(other_branch, dst_reg, - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - regs[insn->src_reg].type == PTR_TO_PACKET_META) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - regs[insn->src_reg].type == PTR_TO_PACKET_META) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET_META, false); - } else if (is_pointer_value(env, insn->dst_reg)) { + } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], + this_branch, other_branch) && + is_pointer_value(env, insn->dst_reg)) { verbose(env, "R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; -- 
cgit v1.2.3 From b24413180f5600bcb3bb70fbed5cf186b60864bd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Nov 2017 15:07:57 +0100 Subject: License cleanup: add SPDX GPL-2.0 license identifier to files with no license Many source files in the tree are missing licensing information, which makes it harder for compliance tools to determine the correct license. By default all files without license information are under the default license of the kernel, which is GPL version 2. Update the files which contain no license information with the 'GPL-2.0' SPDX license identifier. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. How this work was done: Patches were generated and checked against linux-4.14-rc6 for a subset of the use cases: - file had no licensing information it it. - file was a */uapi/* one with no licensing information in it, - file was a */uapi/* one with existing licensing information, Further patches will be generated in subsequent months to fix up cases where non-standard license headers were used, and references to license had to be inferred by heuristics based on keywords. The analysis to determine which SPDX License Identifier to be applied to a file was done in a spreadsheet of side by side results from of the output of two independent scanners (ScanCode & Windriver) producing SPDX tag:value files created by Philippe Ombredanne. Philippe prepared the base worksheet, and did an initial spot review of a few 1000 files. The 4.13 kernel was the starting point of the analysis with 60,537 files assessed. Kate Stewart did a file by file comparison of the scanner results in the spreadsheet to determine which SPDX license identifier(s) to be applied to the file. She confirmed any determination that was not immediately clear with lawyers working with the Linux Foundation. Criteria used to select files for SPDX license identifier tagging was: - Files considered eligible had to be source code files. - Make and config files were included as candidates if they contained >5 lines of source - File already had some variant of a license header in it (even if <5 lines). All documentation files were explicitly excluded. The following heuristics were used to determine which SPDX license identifiers to apply. - when both scanners couldn't find any license traces, file was considered to have no license information in it, and the top level COPYING file license applied. For non */uapi/* files that summary was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 11139 and resulted in the first patch in this series. If that file was a */uapi/* path one, it was "GPL-2.0 WITH Linux-syscall-note" otherwise it was "GPL-2.0". Results of that was: SPDX license identifier # files ---------------------------------------------------|------- GPL-2.0 WITH Linux-syscall-note 930 and resulted in the second patch in this series. - if a file had some form of licensing information in it, and was one of the */uapi/* ones, it was denoted with the Linux-syscall-note if any GPL family license was found in the file or had no licensing in it (per prior point). 
Results summary: SPDX license identifier # files ---------------------------------------------------|------ GPL-2.0 WITH Linux-syscall-note 270 GPL-2.0+ WITH Linux-syscall-note 169 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) 21 ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) 17 LGPL-2.1+ WITH Linux-syscall-note 15 GPL-1.0+ WITH Linux-syscall-note 14 ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) 5 LGPL-2.0+ WITH Linux-syscall-note 4 LGPL-2.1 WITH Linux-syscall-note 3 ((GPL-2.0 WITH Linux-syscall-note) OR MIT) 3 ((GPL-2.0 WITH Linux-syscall-note) AND MIT) 1 and that resulted in the third patch in this series. - when the two scanners agreed on the detected license(s), that became the concluded license(s). - when there was disagreement between the two scanners (one detected a license but the other didn't, or they both detected different licenses) a manual inspection of the file occurred. - In most cases a manual inspection of the information in the file resulted in a clear resolution of the license that should apply (and which scanner probably needed to revisit its heuristics). - When it was not immediately clear, the license identifier was confirmed with lawyers working with the Linux Foundation. - If there was any question as to the appropriate license identifier, the file was flagged for further research and to be revisited later in time. In total, over 70 hours of logged manual review was done on the spreadsheet to determine the SPDX license identifiers to apply to the source files by Kate, Philippe, Thomas and, in some cases, confirmation by lawyers working with the Linux Foundation. Kate also obtained a third independent scan of the 4.13 code base from FOSSology, and compared selected files where the other two scanners disagreed against that SPDX file, to see if there was new insights. The Windriver scanner is based on an older version of FOSSology in part, so they are related. Thomas did random spot checks in about 500 files from the spreadsheets for the uapi headers and agreed with SPDX license identifier in the files he inspected. For the non-uapi files Thomas did random spot checks in about 15000 files. In initial set of patches against 4.14-rc6, 3 files were found to have copy/paste license identifier errors, and have been fixed to reflect the correct identifier. Additionally Philippe spent 10 hours this week doing a detailed manual inspection and review of the 12,461 patched files from the initial patch version early this week with: - a full scancode scan run, collecting the matched texts, detected license ids and scores - reviewing anything where there was a license detected (about 500+ files) to ensure that the applied SPDX license was correct - reviewing anything where there was no detection but the patch license was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied SPDX license was correct This produced a worksheet with 20 files needing minor correction. This worksheet was then exported into 3 different .csv files for the different types of files to be modified. These .csv files were then reviewed by Greg. Thomas wrote a script to parse the csv files and add the proper SPDX tag to the file, in the format that the file expected. This script was further refined by Greg based on the output to detect more types of files automatically and to distinguish between header and source .c files (which need different comment types.) Finally Greg ran the script using the .csv files to generate the patches. 
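[Editor's note] For quick reference, the identifier has to use the comment syntax each file type expects; the sketch below condenses what the hunks later in this patch add as the first line of each file (plain GPL-2.0 here; per the message above, uapi headers instead carry "GPL-2.0 WITH Linux-syscall-note").

/*
 * C source files (.c) get a C++-style comment as the very first line:
 */
// SPDX-License-Identifier: GPL-2.0

/*
 * C headers (.h) keep a classic block comment:
 */
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Makefiles and similar script-style files use the hash form:
 *   # SPDX-License-Identifier: GPL-2.0
 */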
Reviewed-by: Kate Stewart Reviewed-by: Philippe Ombredanne Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/Makefile | 1 + kernel/acct.c | 1 + kernel/audit_tree.c | 1 + kernel/bounds.c | 1 + kernel/bpf/Makefile | 1 + kernel/capability.c | 1 + kernel/cgroup/Makefile | 1 + kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/debug.c | 1 + kernel/cgroup/namespace.c | 1 + kernel/dma.c | 1 + kernel/elfcore.c | 1 + kernel/events/Makefile | 1 + kernel/events/internal.h | 1 + kernel/exec_domain.c | 1 + kernel/futex_compat.c | 1 + kernel/gcov/Makefile | 1 + kernel/gcov/base.c | 1 + kernel/gcov/fs.c | 1 + kernel/gcov/gcc_3_4.c | 1 + kernel/gcov/gcc_4_7.c | 1 + kernel/gcov/gcov.h | 1 + kernel/groups.c | 1 + kernel/irq/Makefile | 1 + kernel/irq/affinity.c | 1 + kernel/irq/autoprobe.c | 1 + kernel/irq/debug.h | 1 + kernel/irq/internals.h | 1 + kernel/irq/migration.c | 1 + kernel/irq/proc.c | 1 + kernel/irq/resend.c | 1 + kernel/irq/settings.h | 1 + kernel/irq/spurious.c | 1 + kernel/kcmp.c | 1 + kernel/kcov.c | 1 + kernel/kexec_internal.h | 1 + kernel/livepatch/core.h | 1 + kernel/livepatch/patch.h | 1 + kernel/livepatch/transition.h | 1 + kernel/locking/Makefile | 1 + kernel/locking/lockdep_internals.h | 1 + kernel/locking/lockdep_proc.c | 1 + kernel/locking/mcs_spinlock.h | 1 + kernel/locking/mutex-debug.h | 1 + kernel/locking/mutex.h | 1 + kernel/locking/osq_lock.c | 1 + kernel/locking/qspinlock_paravirt.h | 1 + kernel/locking/rtmutex-debug.c | 1 + kernel/locking/rtmutex-debug.h | 1 + kernel/locking/rtmutex.h | 1 + kernel/locking/rtmutex_common.h | 1 + kernel/locking/rwsem-spinlock.c | 1 + kernel/locking/rwsem-xadd.c | 1 + kernel/locking/rwsem.c | 1 + kernel/locking/rwsem.h | 1 + kernel/locking/spinlock.c | 1 + kernel/power/Makefile | 1 + kernel/power/autosleep.c | 1 + kernel/power/console.c | 1 + kernel/power/power.h | 1 + kernel/power/process.c | 1 + kernel/power/wakelock.c | 1 + kernel/printk/braille.c | 1 + kernel/printk/braille.h | 1 + kernel/printk/console_cmdline.h | 1 + kernel/range.c | 1 + kernel/rcu/Makefile | 1 + kernel/sched/Makefile | 1 + kernel/sched/autogroup.c | 1 + kernel/sched/autogroup.h | 1 + kernel/sched/completion.c | 1 + kernel/sched/cpuacct.c | 1 + kernel/sched/cpuacct.h | 1 + kernel/sched/cpudeadline.h | 1 + kernel/sched/cpupri.h | 1 + kernel/sched/deadline.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/features.h | 1 + kernel/sched/idle_task.c | 1 + kernel/sched/loadavg.c | 1 + kernel/sched/rt.c | 1 + kernel/sched/sched-pelt.h | 1 + kernel/sched/sched.h | 1 + kernel/sched/stats.c | 1 + kernel/sched/stats.h | 1 + kernel/sched/stop_task.c | 1 + kernel/sched/swait.c | 1 + kernel/sched/topology.c | 1 + kernel/seccomp.c | 1 + kernel/smpboot.h | 1 + kernel/sys.c | 1 + kernel/sys_ni.c | 1 + kernel/sysctl_binary.c | 1 + kernel/task_work.c | 1 + kernel/time/Makefile | 1 + kernel/time/itimer.c | 1 + kernel/time/ntp.c | 1 + kernel/time/ntp_internal.h | 1 + kernel/time/posix-cpu-timers.c | 1 + kernel/time/posix-timers.h | 1 + kernel/time/tick-broadcast-hrtimer.c | 1 + kernel/time/tick-internal.h | 1 + kernel/time/tick-sched.h | 1 + kernel/time/timekeeping.h | 1 + kernel/time/timekeeping_internal.h | 1 + kernel/trace/Makefile | 1 + kernel/trace/power-traces.c | 1 + kernel/trace/rpm-traces.c | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_benchmark.c | 1 + kernel/trace/trace_benchmark.h | 1 + kernel/trace/trace_branch.c | 1 + kernel/trace/trace_entries.h | 1 + kernel/trace/trace_events_filter_test.h | 1 + kernel/trace/trace_export.c | 1 + 
kernel/trace/trace_functions.c | 1 + kernel/trace/trace_functions_graph.c | 1 + kernel/trace/trace_kdb.c | 1 + kernel/trace/trace_mmiotrace.c | 1 + kernel/trace/trace_nop.c | 1 + kernel/trace/trace_output.h | 1 + kernel/trace/trace_sched_switch.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + kernel/trace/trace_selftest.c | 1 + kernel/trace/trace_selftest_dynamic.c | 1 + kernel/trace/trace_stack.c | 1 + kernel/trace/trace_stat.c | 1 + kernel/trace/trace_stat.h | 1 + kernel/trace/trace_syscalls.c | 1 + kernel/trace/tracing_map.h | 1 + kernel/uid16.c | 1 + kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 1 + kernel/workqueue_internal.h | 1 + 134 files changed, 134 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ed470aac53da..172d151d429c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux kernel. # diff --git a/kernel/acct.c b/kernel/acct.c index 5e72af29ab73..6670fbd3e466 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/acct.c * diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 011d46e5f73f..d4b050d9a66e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "audit.h" #include #include diff --git a/kernel/bounds.c b/kernel/bounds.c index e1d1d1952bfa..c373e887c066 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generate definitions needed by the preprocessor. * This code generates raw asm output which is post-processed diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 897daa005b23..af3ab6164ff5 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o diff --git a/kernel/capability.c b/kernel/capability.c index f97fe77ceb88..1e1c0236f55b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..ae448f7632cc 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y := cgroup.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5151ff256c29..bf54ade001be 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CGROUP_INTERNAL_H #define __CGROUP_INTERNAL_H diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index f661b4cc5efd..5f780d8f6a9d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Debug controller * diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 66129eb4371d..b05f1dd58a62 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include diff --git a/kernel/dma.c b/kernel/dma.c index 6c6262f86c17..3506fc34a712 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. 
* diff --git a/kernel/elfcore.c b/kernel/elfcore.c index e556751d15d9..fc482c8e0bd8 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 2925188f50ea..3c022e33c109 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) endif diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 843e97047335..09b1537ae06c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 6873bb3e6b7e..0975b0268545 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Handling of different ABIs (personalities). * diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 3f409968e466..83f830acbb5f 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/futex_compat.c * diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 752d6486b67e..c6c50e5c680e 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index c51a49c9be70..9c7c8d5c18f2 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This code maintains a list of active profiling data structures. * diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index edf67c493a8e..6e40ff6be083 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This code exports profiling data as debugfs files to userspace. * diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 27bc88a35013..1e32e66c9563 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This code provides functions to handle gcc's profiling data format * introduced with gcc 3.4. Future versions of gcc may change the gcov diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 46a18e72bce6..ca5e5c0ef853 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This code provides functions to handle gcc's profiling data format * introduced with gcc 4.7. diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 92c8e22a29ed..de118ad4a024 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Profiling infrastructure declarations. 
* diff --git a/kernel/groups.c b/kernel/groups.c index 434f6665f187..e357bc800111 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Supplementary group IDs */ diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1970cafe8f2a..ed15d142694b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_IRQ_TIMINGS) += timings.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d69bd77252a7..e12d35108225 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2016 Thomas Gleixner. * Copyright (C) 2016-2017 Christoph Hellwig. diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index d30a0dd5cc02..befa671fba64 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/irq/autoprobe.c * diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index e75e29e4434a..17f05ef8f575 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Debugging printout: */ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a4aa39009f0d..44ed5f8c8759 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * IRQ subsystem internal functions and variables: * diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 6ca054a3f91d..86ae0eb80b53 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6376b4a598d3..c010cc0daf79 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/irq/proc.c * diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index b86886beee4f..1d08f45135c2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/irq/resend.c * diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 320579d89091..e43795cd2ccf 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Internal header to deal with irq_desc->status which will be renamed * to irq_desc->settings. 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 061ba7eed4ed..987d7bca4864 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/irq/spurious.c * diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 055bb2962a0b..a0e3d7a0e8b8 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/kcov.c b/kernel/kcov.c index 3f693a0f6f3e..fc6af9e1308b 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) "kcov: " fmt #define DISABLE_BRANCH_PROFILING diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 50dfcb039a41..48aaf2ac0d0d 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_KEXEC_INTERNAL_H #define LINUX_KEXEC_INTERNAL_H diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index c74f24c47837..a351601d7f76 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LIVEPATCH_CORE_H #define _LIVEPATCH_CORE_H diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h index 0db227170c36..e72d8250d04b 100644 --- a/kernel/livepatch/patch.h +++ b/kernel/livepatch/patch.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LIVEPATCH_PATCH_H #define _LIVEPATCH_PATCH_H diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index ce09b326546c..0f6e27c481f9 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LIVEPATCH_TRANSITION_H #define _LIVEPATCH_TRANSITION_H diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 760158d9d98d..392c7f23af76 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Any varying coverage in these files is non-deterministic # and is generally not a function of system call inputs. 
KCOV_INSTRUMENT := n diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 1da4669d57a7..d459d624ba2a 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * kernel/lockdep_internals.h * diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 68d9e267ccd4..ad69bbc9bd28 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kernel/lockdep_proc.c * diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 6a385aabcce7..f046b7ce9dd6 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * MCS lock defines * diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 4174417d5309..1edd3f45a4ec 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6ebc1902f779..1c2287d3fa71 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a74ee6abd039..6ef600aa0f47 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 43555681c40b..15b6a39366c6 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _GEN_PV_LOCK_SLOWPATH #error "do not include this file" #endif diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index f4a74e78d467..fd4fe1f5b458 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * RT-Mutexes: blocking mutual exclusion locks with PI support * diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 5078c6ddf4a5..fc549713bba3 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * RT-Mutexes: blocking mutual exclusion locks with PI support * diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 5c253caffe91..732f96abf462 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * RT-Mutexes: blocking mutual exclusion locks with PI support * diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7453be0485a5..124e98ca0b17 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * RT Mutexes: blocking mutual exclusion locks with PI support * diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 0848634c5512..a7ffb2a96ede 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* rwsem-spinlock.c: R/W semaphores: contention handling functions for * generic 
spinlock implementation * diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 1fefe6dcafd7..e795908f3607 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* rwsem.c: R/W semaphores: contention handling functions * * Written by David Howells (dhowells@redhat.com). diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 4d48b1c4870d..a6c76a4832b4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* kernel/rwsem.c: R/W semaphores, public implementation * * Written by David Howells (dhowells@redhat.com). diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a699f4048ba1..a883b8f1fdc6 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * The owner field of the rw_semaphore structure will be set to * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..6e40fdfba326 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (2004) Linus Torvalds * diff --git a/kernel/power/Makefile b/kernel/power/Makefile index eb4f717705ba..a3f79f0eef36 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index 9012ecf7b814..41e83a779e19 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kernel/power/autosleep.c * diff --git a/kernel/power/console.c b/kernel/power/console.c index 0e781798b0b3..fcdf0e14a47d 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Functions for saving/restoring console. * diff --git a/kernel/power/power.h b/kernel/power/power.h index 1d2d761e3c25..f29cd178df90 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include diff --git a/kernel/power/process.c b/kernel/power/process.c index 50f25cb370c6..7381d49a44db 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * drivers/power/process.c - Functions for starting/stopping processes on * suspend transitions. 
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 1896386e16bb..dfba59be190b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kernel/power/wakelock.c * diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 61d41ca41844..1d21ebacfdb8 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h index 749a6756843a..123154f86304 100644 --- a/kernel/printk/braille.h +++ b/kernel/printk/braille.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PRINTK_BRAILLE_H #define _PRINTK_BRAILLE_H diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 2ca4a8b5fe57..11f19c466af5 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _CONSOLE_CMDLINE_H #define _CONSOLE_CMDLINE_H diff --git a/kernel/range.c b/kernel/range.c index 82cfc285b046..d84de6766472 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Range add and subtract */ diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 13c0fc852767..020e8b6a644b 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Any varying coverage in these files is non-deterministic # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 78f54932ea1d..a9ee16bbc693 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index de6d7f4dfcb5..a43df5193538 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "sched.h" #include diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index ce40c810cd5c..27cd22b89824 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_SCHED_AUTOGROUP #include diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index cc873075c3bd..2ddaec40956f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic wait-for-completion handler; * diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f95ab29a45d0..44ab32a4fab6 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ba72807c73d4..a8358a57a316 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_CGROUP_CPUACCT extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index f7da8c55bba0..b010d26e108e 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CPUDL_H #define _LINUX_CPUDL_H diff --git 
a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 63cbb9ca0496..bab050019071 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CPUPRI_H #define _LINUX_CPUPRI_H diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..4ae5c1ea90e2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Deadline Scheduling Class (SCHED_DEADLINE) * diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3f3094856fe..5c09ddf8c832 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) * diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 319ed0e8a347..9552fd5854bf 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Only give sleepers 50% of their service deficit. This allows * them to run sooner, but does not allow tons of sleepers to diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 0c00172db63e..d518664cce4f 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "sched.h" /* diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f14716a3522f..89a989e4d758 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kernel/sched/loadavg.c * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..3c96c80e0992 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index cd200d16529e..a26473674fb7 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. 
*/ static const u32 runnable_avg_yN_inv[] = { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..3b448ba82225 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include #include diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 87e2c9f0c33e..940b1fa1d2ce 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index d5710651043b..baf500d12b7c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 9f69fb630853..45caf90b24cd 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "sched.h" /* diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 2227e183e202..9ff1555341ed 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f1cf4f306a82..6798276d29af 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Scheduler topology setup/handling methods */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ae832e13b97..418a1c045933 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/seccomp.c * diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 485b81cfab34..34dd3d7ba40b 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef SMPBOOT_H #define SMPBOOT_H diff --git a/kernel/sys.c b/kernel/sys.c index 9aebc2935013..524a4cb9bbe2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/sys.c * diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8acef8576ce9..b5189762d275 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 58ea8c03662e..e8c0dab4fd65 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include "../fs/xfs/xfs_sysctl.h" diff --git a/kernel/task_work.c b/kernel/task_work.c index 836a72a66fba..5718b3ea202a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 938dbf33ef49..f1e46f338a9c 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y += time.o timer.o hrtimer.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 2ef98a02376a..f26acef5d7b4 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/itimer.c * diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index edf19cc53140..99e03bec68e4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: 
GPL-2.0 /* * NTP state machine interfaces and logic. * diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index d8a7c11fa71a..0a53e6ea47b1 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NTP_INTERNAL_H #define _LINUX_NTP_INTERNAL_H diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8585ad6e472a..5b117110b55b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implement CPU time clocks for the POSIX clock interface. */ diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index fb303c3be4d3..151e28f5bf30 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #define TIMER_RETRY 1 struct k_clock { diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a7bb8f33ae07..58045eb976c3 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/time/tick-broadcast-hrtimer.c * This file emulates a local clock event device diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index be0ac01f2e12..f8e1845aa464 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * tick internal variable and functions used by low/high res code */ diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 075444e3d48e..954b43dbf21c 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TICK_SCHED_H #define _TICK_SCHED_H diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index d0914676d4c5..c9f9af339914 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KERNEL_TIME_TIMEKEEPING_H #define _KERNEL_TIME_TIMEKEEPING_H /* diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 9a18f121f399..fdbeeb02dde9 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TIMEKEEPING_INTERNAL_H #define _TIMEKEEPING_INTERNAL_H /* diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 90f2701d92a7..19a15b2f1190 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Do not instrument the tracer itself: diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 0c7dee221dca..21bb161c2316 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Power trace points * diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c index 4b3b5eaf94d1..25dec0b00280 100644 --- a/kernel/trace/rpm-traces.c +++ b/kernel/trace/rpm-traces.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Power trace points * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 652c682707cd..401b0639116f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H diff --git 
a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 16a8cf02eee9..79f838a75077 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index ebdbfc2f2a64..be1d86ff753d 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM benchmark diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4d8fdf3184dc..4ad967453b6f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * unlikely profiler * diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index adcdbbeae010..e954ae3d82c0 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file defines the trace event structures that go into the ring * buffer directly. They are created via macros so that changes for them diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h index bfd4dba0d603..39d7ef4f57cb 100644 --- a/kernel/trace/trace_events_filter_test.h +++ b/kernel/trace/trace_events_filter_test.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM test diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 39aa7aa66468..548e62eb5c46 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_export.c - export basic ftrace utilities to user space * diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a0910c0cdf2e..27f7ad12c4b1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer based function tracer * diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b8f1f54731af..23c0b0cb5fb9 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * * Function graph tracer. 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 57149bce6aad..d953c163a079 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kdb helper for dumping the ftrace buffer * diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index dca78fc48439..b0388016b687 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Memory mapped I/O tracing * diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 49f61fe96a6b..50523f953a5d 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * nop tracer * diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index fabc49bcd493..dbba03ed96de 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b341c02730be..e288168661e1 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace context switch * diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0c331978b1a6..7d461dcd4831 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace task wakeup timings * diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b17ec642793b..cd70eb5df38e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Include in trace.c */ #include diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index b4c475a0a48b..8cda06a10d66 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "trace.h" int DYN_FTRACE_TEST_NAME(void) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 49cb41412eec..719a52a4064a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Steven Rostedt * diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 413ff108fbd0..75bf1bcb4a8a 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Infrastructure for statistic tracing (histogram output). 
* diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8f03914b9a6a..76d30b4ebe83 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __TRACE_STAT_H #define __TRACE_STAT_H diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..a2a642f2c64f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 618838f5f30a..ab0ca77331d0 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __TRACING_MAP_H #define __TRACING_MAP_H diff --git a/kernel/uid16.c b/kernel/uid16.c index 5c2dc5b2bf4f..ce74a4901d2b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Wrapper functions for 16bit uid back compatibility. All nicely tied * together in the faint hope we can take the out in five years time. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6bcb854909c0..c8e06703e44c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Detect hard and soft lockups on a system * diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71a62ceacdc8..4583feb66393 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Detect hard lockups on a system * diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8635417c587b..efdd72e15794 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * kernel/workqueue_internal.h * -- cgit v1.2.3 From 6082a6e44434a17f194048b7d48df56f148ec6d4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 1 Nov 2017 11:04:51 -0700 Subject: kernel/time/Kconfig: Fix typo in comment Fix typo in Kconfig comment text. Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Cc: Jiri Kosina Link: https://lkml.kernel.org/r/0e586dd4-2b27-864e-c252-bc72df52fd01@infradead.org --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index ac09bc29eb08..d689a9557e17 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -56,7 +56,7 @@ menu "Timers subsystem" # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices -# are supported independ of this. +# are supported independent of this. config TICK_ONESHOT bool -- cgit v1.2.3 From e78c38f6bdd900b2ad9ac9df8eff58b745dc5b3c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 23 Oct 2017 13:41:51 +0200 Subject: futex: futex_wake_op, do not fail on invalid op In commit 30d6e0a4190d ("futex: Remove duplicated code and fix undefined behaviour"), I let FUTEX_WAKE_OP to fail on invalid op. Namely when op should be considered as shift and the shift is out of range (< 0 or > 31). 
But strace's test suite does this madness: futex(0x7fabd78bcffc, 0x5, 0xfacefeed, 0xb, 0x7fabd78bcffc, 0xa0caffee); futex(0x7fabd78bcffc, 0x5, 0xfacefeed, 0xb, 0x7fabd78bcffc, 0xbadfaced); futex(0x7fabd78bcffc, 0x5, 0xfacefeed, 0xb, 0x7fabd78bcffc, 0xffffffff); When I pick the first 0xa0caffee, it decodes as: 0x80000000 & 0xa0caffee: oparg is shift 0x70000000 & 0xa0caffee: op is FUTEX_OP_OR 0x0f000000 & 0xa0caffee: cmp is FUTEX_OP_CMP_EQ 0x00fff000 & 0xa0caffee: oparg is sign-extended 0xcaf = -849 0x00000fff & 0xa0caffee: cmparg is sign-extended 0xfee = -18 That means the op tries to do this: (futex |= (1 << (-849))) == -18 which is completely bogus. The new check of op in the code is: if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) return -EINVAL; oparg = 1 << oparg; } which results obviously in the "Invalid argument" errno: FAIL: futex =========== futex(0x7fabd78bcffc, 0x5, 0xfacefeed, 0xb, 0x7fabd78bcffc, 0xa0caffee) = -1: Invalid argument futex.test: failed test: ../futex failed with code 1 So let us soften the failure to print only a (ratelimited) message, crop the value and continue as if it were right. When userspace keeps up, we can switch this to return -EINVAL again. [v2] Do not return 0 immediatelly, proceed with the cropped value. Fixes: 30d6e0a4190d ("futex: Remove duplicated code and fix undefined behaviour") Signed-off-by: Jiri Slaby Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Darren Hart Signed-off-by: Linus Torvalds --- kernel/futex.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0518a0bfc746..0d638f008bb1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1570,8 +1570,16 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { - if (oparg < 0 || oparg > 31) - return -EINVAL; + if (oparg < 0 || oparg > 31) { + char comm[sizeof(current->comm)]; + /* + * kill this print and return -EINVAL when userspace + * is sane again + */ + pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", + get_task_comm(comm, current), oparg); + oparg &= 31; + } oparg = 1 << oparg; } -- cgit v1.2.3 From fd30b717b86dc30ffe25596f8de6542a02ae9401 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 17:58:54 -0700 Subject: rcu: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Kees Cook --- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/tree_plugin.h | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45f2ffbc1e78..96a3cdaeed91 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1076,7 +1076,7 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) * counter in the element should never be greater than 1, otherwise, the * RCU implementation is broken. 
*/ -static void rcu_torture_timer(unsigned long unused) +static void rcu_torture_timer(struct timer_list *unused) { int idx; unsigned long started; @@ -1163,7 +1163,7 @@ rcu_torture_reader(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); + timer_setup_on_stack(&t, rcu_torture_timer, 0); do { if (irqreader && cur_ops->irq_capable) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..e85946d9843b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2261,9 +2261,11 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ -static void do_nocb_deferred_wakeup_timer(unsigned long x) +static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { - do_nocb_deferred_wakeup_common((struct rcu_data *)x); + struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + + do_nocb_deferred_wakeup_common(rdp); } /* @@ -2327,8 +2329,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_lock_init(&rdp->nocb_lock); - setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, - (unsigned long)rdp); + timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); } /* -- cgit v1.2.3 From 7cce782ef32f5003c18ab8f9f586f2ed5ce2c33e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:05:51 +0100 Subject: bpf: fix link error without CONFIG_NET I ran into this link error with the latest net-next plus linux-next trees when networking is disabled: kernel/bpf/verifier.o:(.rodata+0x2958): undefined reference to `tc_cls_act_analyzer_ops' kernel/bpf/verifier.o:(.rodata+0x2970): undefined reference to `xdp_analyzer_ops' It seems that the code was written to deal with varying contents of the arrray, but the actual #ifdef was missing. Both tc_cls_act_analyzer_ops and xdp_analyzer_ops are defined in the core networking code, so adding a check for CONFIG_NET seems appropriate here, and I've verified this with many randconfig builds Fixes: 4f9218aaf8a4 ("bpf: move knowledge about post-translation offsets out of verifier") Signed-off-by: Arnd Bergmann Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 530b68550fd2..5f3799dcba01 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4588,8 +4588,10 @@ err_free_env: } static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { +#ifdef CONFIG_NET [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +#endif }; int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, -- cgit v1.2.3 From eba0c929d1d0f16c4b03628b7bf8ce363b9e5c9a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:05:52 +0100 Subject: bpf: fix out-of-bounds access warning in bpf_check The bpf_verifer_ops array is generated dynamically and may be empty depending on configuration, which then causes an out of bounds access: kernel/bpf/verifier.c: In function 'bpf_check': kernel/bpf/verifier.c:4320:29: error: array subscript is above array bounds [-Werror=array-bounds] This adds a check to the start of the function as a workaround. I would assume that the function is never called in that configuration, so the warning is probably harmless. Fixes: 00176a34d9e2 ("bpf: remove the verifier ops from program structure") Signed-off-by: Arnd Bergmann Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f3799dcba01..ab5aa5497666 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4474,6 +4474,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) struct bpf_verifer_log *log; int ret = -EINVAL; + /* no program is valid */ + if (ARRAY_SIZE(bpf_verifier_ops) == 0) + return -EINVAL; + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ -- cgit v1.2.3 From 8c01c4f896aa3404af948880dcb29a2d51c833dc Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 2 Nov 2017 11:18:01 -0400 Subject: bpf: fix verifier NULL pointer dereference do_check() can fail early without allocating env->cur_state under memory pressure. Syzkaller found the stack below on the linux-next tree because of this. 
kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 27062 Comm: syz-executor5 Not tainted 4.14.0-rc7+ #106 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801c2c74700 task.stack: ffff8801c3e28000 RIP: 0010:free_verifier_state kernel/bpf/verifier.c:347 [inline] RIP: 0010:bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533 RSP: 0018:ffff8801c3e2f5c8 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 00000000fffffff4 RCX: 0000000000000000 RDX: 0000000000000070 RSI: ffffffff817d5aa9 RDI: 0000000000000380 RBP: ffff8801c3e2f668 R08: 0000000000000000 R09: 1ffff100387c5d9f R10: 00000000218c4e80 R11: ffffffff85b34380 R12: ffff8801c4dc6a28 R13: 0000000000000000 R14: ffff8801c4dc6a00 R15: ffff8801c4dc6a20 FS: 00007f311079b700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004d4a24 CR3: 00000001cbcd0000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: bpf_prog_load+0xcbb/0x18e0 kernel/bpf/syscall.c:1166 SYSC_bpf kernel/bpf/syscall.c:1690 [inline] SyS_bpf+0xae9/0x4620 kernel/bpf/syscall.c:1652 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x452869 RSP: 002b:00007f311079abe8 EFLAGS: 00000212 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452869 RDX: 0000000000000030 RSI: 0000000020168000 RDI: 0000000000000005 RBP: 00007f311079aa20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000004b7550 R13: 00007f311079ab58 R14: 00000000004b7560 R15: 0000000000000000 Code: df 48 c1 ea 03 80 3c 02 00 0f 85 e6 0b 00 00 4d 8b 6e 20 48 b8 00 00 00 00 00 fc ff df 49 8d bd 80 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 b6 0b 00 00 49 8b bd 80 03 00 00 e8 d6 0c 26 RIP: free_verifier_state kernel/bpf/verifier.c:347 [inline] RSP: ffff8801c3e2f5c8 RIP: bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533 RSP: ffff8801c3e2f5c8 ---[ end trace c8d37f339dc64004 ]--- Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption") Fixes: 1969db47f8d0 ("bpf: fix verifier memory leaks") Signed-off-by: Craig Gallek Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ab5aa5497666..04357ad5a812 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4534,8 +4534,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: while (!pop_stack(env, NULL, NULL)); @@ -4643,8 +4645,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: while (!pop_stack(env, NULL, NULL)); -- cgit v1.2.3 From edbfd9112f70c34b2965580a67dad5fb306fb6c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2017 07:02:15 -0700 Subject: Revert "workqueue: respect isolated cpus when queueing an unbound work" This reverts commit b5149873a0c299195b5346fe4dc2c5b04ae2f995. It conflicts with the following isolcpus change from the sched branch. edb9382175c3 ("sched/isolation: Move isolcpus= handling to the housekeeping code") Let's revert for now. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bfa433b38a61..64d0edf428f8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4980,10 +4980,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; - /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. - */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5583,7 +5579,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map); + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From 2d2123bc7c7f843aa9db87720de159a049839862 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 31 Oct 2017 15:51:14 +0000 Subject: arm64/sve: Add prctl controls for userspace vector length management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds two arm64-specific prctls, to permit userspace to control its vector length: * PR_SVE_SET_VL: set the thread's SVE vector length and vector length inheritance mode. * PR_SVE_GET_VL: get the same information. Although these prctls resemble instruction set features in the SVE architecture, they provide additional control: the vector length inheritance mode is Linux-specific and nothing to do with the architecture, and the architecture does not permit EL0 to set its own vector length directly. Both can be used in portable tools without requiring the use of SVE instructions. 
Signed-off-by: Dave Martin Reviewed-by: Catalin Marinas Cc: Alex Bennée [will: Fixed up prctl constants to avoid clash with PDEATHSIG] Signed-off-by: Will Deacon --- kernel/sys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 9aebc2935013..c541916b38c6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -110,6 +110,12 @@ #ifndef SET_FP_MODE # define SET_FP_MODE(a,b) (-EINVAL) #endif +#ifndef SVE_SET_VL +# define SVE_SET_VL(a) (-EINVAL) +#endif +#ifndef SVE_GET_VL +# define SVE_GET_VL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2385,6 +2391,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SVE_SET_VL: + error = SVE_SET_VL(arg2); + break; + case PR_SVE_GET_VL: + error = SVE_GET_VL(); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From d62d813c0d714a2d0aaf3d796a7a51ae60bf5470 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 3 Nov 2017 13:36:42 +0000 Subject: cpufreq: schedutil: Examine the correct CPU when we update util After commit 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks) we stopped to always read the utilization for the CPU we are running the governor on, and instead we read it for the CPU which we've been told has updated utilization. This is stored in sugov_cpu->cpu. The value is set in sugov_register() but we clear it in sugov_start() which leads to always looking at the utilization of CPU0 instead of the correct one. Fix this by consolidating the initialization code into sugov_start(). Fixes: 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks) Signed-off-by: Chris Redpath Reviewed-by: Patrick Bellasi Reviewed-by: Brendan Jackman Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9209d83ecdcf..ba0da243fdd8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -649,6 +649,7 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; @@ -714,11 +715,6 @@ struct cpufreq_governor *cpufreq_default_governor(void) static int __init sugov_register(void) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(sugov_cpu, cpu).cpu = cpu; - return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); -- cgit v1.2.3 From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:17 -0700 Subject: bpf: offload: add infrastructure for loading programs for a specific netdev The fact that we don't know which device the program is going to be used on is quite limiting in current eBPF infrastructure. We have to reverse or limit the changes which kernel makes to the loaded bytecode if we want it to be offloaded to a networking device. We also have to invent new APIs for debugging and troubleshooting support. Make it possible to load programs for a specific netdev. This helps us to bring the debug information closer to the core eBPF infrastructure (e.g. we will be able to reuse the verifer log in device JIT). 
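(As a rough sketch of the driver side this expects: the BPF_OFFLOAD_* commands and the struct netdev_bpf fields are taken from the diff below, while the mydrv_* names and helpers are purely hypothetical placeholders.)

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/netdevice.h>

/* Hypothetical driver callbacks, implemented elsewhere in the driver. */
extern const struct bpf_ext_analyzer_ops mydrv_bpf_analyzer_ops;
int mydrv_jit_translate(struct bpf_prog *prog);
void mydrv_jit_destroy(struct bpf_prog *prog);

static int mydrv_ndo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case BPF_OFFLOAD_VERIFIER_PREP:
		/* Hand the core verifier the driver's per-instruction hooks. */
		bpf->verifier.ops = &mydrv_bpf_analyzer_ops;
		return 0;
	case BPF_OFFLOAD_TRANSLATE:
		/* Translate the original, unmodified bytecode for the device. */
		return mydrv_jit_translate(bpf->offload.prog);
	case BPF_OFFLOAD_DESTROY:
		mydrv_jit_destroy(bpf->offload.prog);
		return 0;
	default:
		return -EINVAL;
	}
}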
It allows device JITs to perform translation on the original bytecode. __bpf_prog_get() when called to get a reference for an attachment point will now refuse to give it if program has a device assigned. Following patches will add a version of that function which passes the expected netdev in. @type argument in __bpf_prog_get() is renamed to attach_type to make it clearer that it's only set on attachment. All calls to ndo_bpf are protected by rtnl, only verifier callbacks are not. We need a wait queue to make sure netdev doesn't get destroyed while verifier is still running and calling its driver. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 1 + kernel/bpf/core.c | 10 ++- kernel/bpf/offload.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 +++-- kernel/bpf/verifier.c | 15 ++++- 5 files changed, 217 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/offload.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 16e95c8e749e..e691da0b3bab 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7fe448799d76..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. 
*/ - fp = bpf_int_jit_compile(fp); + if (!bpf_prog_is_dev_bound(fp->aux)) { + fp = bpf_int_jit_compile(fp); + } else { + *err = bpf_prog_offload_compile(fp); + if (*err) + return fp; + } bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) struct bpf_prog_aux *aux; aux = container_of(work, struct bpf_prog_aux, work); + if (bpf_prog_is_dev_bound(aux)) + bpf_prog_offload_destroy(aux->prog); bpf_jit_free(aux->prog); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c new file mode 100644 index 000000000000..5553e0e2f8b1 --- /dev/null +++ b/kernel/bpf/offload.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include + +/* protected by RTNL */ +static LIST_HEAD(bpf_prog_offload_devs); + +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_dev_offload *offload; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->prog_flags) + return -EINVAL; + + offload = kzalloc(sizeof(*offload), GFP_USER); + if (!offload) + return -ENOMEM; + + offload->prog = prog; + init_waitqueue_head(&offload->verifier_done); + + rtnl_lock(); + offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + if (!offload->netdev) { + rtnl_unlock(); + kfree(offload); + return -EINVAL; + } + + prog->aux->offload = offload; + list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + rtnl_unlock(); + + return 0; +} + +static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, + struct netdev_bpf *data) +{ + struct net_device *netdev = prog->aux->offload->netdev; + + ASSERT_RTNL(); + + if (!netdev) + return -ENODEV; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + data->command = cmd; + + return netdev->netdev_ops->ndo_bpf(netdev, data); +} + +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + struct netdev_bpf data = {}; + int err; + + data.verifier.prog = env->prog; + + rtnl_lock(); + err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); + if (err) + goto exit_unlock; + + env->dev_ops = data.verifier.ops; + + env->prog->aux->offload->dev_state = true; + env->prog->aux->offload->verifier_running = true; +exit_unlock: + rtnl_unlock(); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + + data.offload.prog = prog; + + if (offload->verifier_running) + wait_event(offload->verifier_done, !offload->verifier_running); + + if (offload->dev_state) + WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + + offload->dev_state = false; + list_del_init(&offload->offloads); + offload->netdev = NULL; +} + +void bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + __bpf_prog_offload_destroy(prog); + rtnl_unlock(); + + kfree(offload); +} + +static int bpf_prog_offload_translate(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + int ret; + + data.offload.prog = prog; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); + rtnl_unlock(); + + return ret; +} + +static unsigned int 
bpf_prog_warn_on_exec(const void *ctx, + const struct bpf_insn *insn) +{ + WARN(1, "attempt to execute device eBPF program on the host!"); + return 0; +} + +int bpf_prog_offload_compile(struct bpf_prog *prog) +{ + prog->bpf_func = bpf_prog_warn_on_exec; + + return bpf_prog_offload_translate(prog); +} + +const struct bpf_prog_ops bpf_offload_prog_ops = { +}; + +static int bpf_offload_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dev_offload *offload, *tmp; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_UNREGISTER: + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, + offloads) { + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block bpf_offload_notifier = { + .notifier_call = bpf_offload_notification, +}; + +static int __init bpf_offload_init(void) +{ + register_netdevice_notifier(&bpf_offload_notifier); + return 0; +} + +subsys_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 323be2473c4b..1574b9f0f24e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; - prog->aux->ops = bpf_prog_types[type]; + if (!bpf_prog_is_dev_bound(prog->aux)) + prog->aux->ops = bpf_prog_types[type]; + else + prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } @@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (type && prog->type != *type) { + if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_name +#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; + if (attr->prog_target_ifindex) { + err = bpf_prog_offload_init(prog, attr); + if (err) + goto free_prog; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04357ad5a812..51aabb32ad67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) - return 0; + if (env->analyzer_ops && env->analyzer_ops->insn_hook) + return env->analyzer_ops->insn_hook(env, insn_idx, + prev_insn_idx); + if (env->dev_ops && env->dev_ops->insn_hook) + return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); + return 0; } static int do_check(struct bpf_verifier_env *env) @@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (env->prog->aux->offload) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto err_unlock; + } + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:18 -0700 Subject: bpf: report offload info to user space Extend struct bpf_prog_info to contain information about program being bound to a device. Since the netdev may get destroyed while program still exists we need a flag to indicate the program is loaded for a device, even if the device is gone. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/offload.c | 12 ++++++++++++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 5553e0e2f8b1..2816feb38be1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + u32 ifindex; + + rtnl_lock(); + ifindex = offload->netdev ? offload->netdev->ifindex : 0; + rtnl_unlock(); + + return ifindex; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1574b9f0f24e..3217c20ea91b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1592,6 +1592,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + info.status |= BPF_PROG_STATUS_DEV_BOUND; + info.ifindex = bpf_prog_offload_ifindex(prog); + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:20 -0700 Subject: xdp: allow attaching programs loaded for specific device Pass the netdev pointer to bpf_prog_get_type(). This way BPF code can decide whether the device matches what the code was loaded/translated for. 
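As a rough illustration of what this enables at attach time (the helper itself is added in the diff below; the attach function here is a hypothetical stand-in for the real XDP attach path, not a quote of it):

#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/netdevice.h>

/* Accept an XDP program only if it is host-compiled, or was loaded bound
 * to this particular netdev.
 */
static int xdp_attach_prog_sketch(struct net_device *dev, u32 prog_fd)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type_dev(prog_fd, BPF_PROG_TYPE_XDP, dev);
	if (IS_ERR(prog))
		return PTR_ERR(prog);	/* wrong type, or bound to a different device */

	/* ... install prog through the driver's ndo_bpf / XDP_SETUP_PROG path ... */

	bpf_prog_put(prog);	/* the real path would keep this reference while installed */
	return 0;
}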
Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3217c20ea91b..68f9123acd39 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) +static bool bpf_prog_can_attach(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, + struct net_device *netdev) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + if (prog->type != *attach_type) + return false; + if (offload && offload->netdev != netdev) + return false; + + return true; +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, + struct net_device *netdev) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1065,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { + if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1078,12 +1093,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL); + return __bpf_prog_get(ufd, NULL, NULL); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1091,6 +1106,16 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) } EXPORT_SYMBOL_GPL(bpf_prog_get_type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev) +{ + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex -- cgit v1.2.3 From 6c8dfe21c435cf2953e3cee43e12180cbc4f0820 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:21 -0700 Subject: cls_bpf: allow attaching programs loaded for specific device If TC program is loaded with skip_sw flag, we should allow the device-specific programs to be accepted. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 68f9123acd39..416d70cdfc76 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1115,6 +1115,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, trace_bpf_prog_get_type(prog); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex -- cgit v1.2.3 From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:30 -0700 Subject: bpf: remove old offload/analyzer Thanks to the ability to load a program for a specific device, running verifier twice is no longer needed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 75 --------------------------------------------------- 1 file changed, 75 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51aabb32ad67..add845fe788a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -949,9 +949,6 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (env->analyzer_ops) - return 0; - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -3736,9 +3733,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (env->analyzer_ops && env->analyzer_ops->insn_hook) - return env->analyzer_ops->insn_hook(env, insn_idx, - prev_insn_idx); if (env->dev_ops && env->dev_ops->insn_hook) return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); @@ -4601,72 +4595,3 @@ err_free_env: kfree(env); return ret; } - -static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { -#ifdef CONFIG_NET - [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, - [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, -#endif -}; - -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv) -{ - struct bpf_verifier_env *env; - int ret; - - if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || - !bpf_analyzer_ops[prog->type]) - return -EOPNOTSUPP; - - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - prog->len); - ret = -ENOMEM; - if (!env->insn_aux_data) - goto err_free_env; - env->prog = prog; - env->ops = bpf_analyzer_ops[env->prog->type]; - env->analyzer_ops = ops; - env->analyzer_priv = priv; - - /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); - - env->strict_alignment = false; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - env->strict_alignment = true; - - env->explored_states = kcalloc(env->prog->len, - sizeof(struct bpf_verifier_state_list *), - GFP_KERNEL); - ret = -ENOMEM; - if (!env->explored_states) - goto skip_full_check; - - ret = check_cfg(env); - if (ret < 0) - goto skip_full_check; - - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - -skip_full_check: - while (!pop_stack(env, 
NULL, NULL)); - free_states(env); - - mutex_unlock(&bpf_verifier_lock); - vfree(env->insn_aux_data); -err_free_env: - kfree(env); - return ret; -} -EXPORT_SYMBOL_GPL(bpf_analyzer); -- cgit v1.2.3 From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller, provided by cgroup v1. This patch adds a new eBPF program type, which in combination of previously added ability to attach multiple eBPF programs to a cgroup, will provide a similar functionality, but with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer which defines if the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 ++++++ kernel/bpf/verifier.c | 1 + 3 files changed, 75 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3db5a17fcfe8..b789ab78d28f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type) +{ + struct cgroup *cgrp; + struct bpf_cgroup_dev_ctx ctx = { + .access_type = (access << 16) | dev_type, + .major = major, + .minor = minor, + }; + int allow = 1; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN); + rcu_read_unlock(); + + return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); + +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} + +static bool cgroup_dev_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) + return false; + /* The verifier guarantees that size > 0. 
*/ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { + .get_func_proto = cgroup_dev_func_proto, + .is_valid_access = cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 416d70cdfc76..09badc37e864 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, true); @@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, false); @@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: break; default: return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index add845fe788a..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: break; default: return 0; -- cgit v1.2.3 From 9a19b463863e757e649c37af245b6af101410c1e Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 2 Nov 2017 23:05:12 -0400 Subject: workqueue: Fix comment for unbound workqueue's attrbutes Signed-off-by: Wang Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..5f99851bff09 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5013,9 +5013,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) * * Unbound workqueues have the following extra attributes. * - * id RO int : the associated pool ID + * pool_ids RO int : the associated pool IDs for each node * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers + * numa RW bool : whether enable NUMA affinity */ struct wq_device { struct workqueue_struct *wq; -- cgit v1.2.3 From 01ee6cfb1483fe57c9cbd8e73817dfbf9bacffd3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Nov 2017 13:30:28 -0500 Subject: cgroup: export list of delegatable control files using sysfs Delegatable cgroup v2 control files may require special handling (e.g. chowning), and the exact list of such files varies between kernel versions (and likely to be extended in the future). To guarantee correctness of this list and simplify the life of userspace (systemd, first of all), let's export the list via /sys/kernel/cgroup/delegate pseudo-file. Format is siple: each control file name is printed on a new line. 
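The chowning use case mentioned above can be sketched in userspace; the helper below is hypothetical and not part of the patch (helper name, buffer sizes and the ENOENT handling are assumptions), and it is followed by the commit's own shell example.

/* Hypothetical userspace helper: chown every delegatable control file
 * listed by the new pseudo-file inside a delegated cgroup directory. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int delegate_cgroup(const char *cgrp_dir, uid_t uid, gid_t gid)
{
	char name[256], path[4096];
	FILE *f = fopen("/sys/kernel/cgroup/delegate", "r");

	if (!f)
		return -1;
	while (fgets(name, sizeof(name), f)) {
		name[strcspn(name, "\n")] = '\0';	/* one file name per line */
		snprintf(path, sizeof(path), "%s/%s", cgrp_dir, name);
		/* A controller may not be enabled here; skip missing files. */
		if (chown(path, uid, gid) && errno != ENOENT)
			perror(path);
	}
	fclose(f);
	return 0;
}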
Example: $ cat /sys/kernel/cgroup/delegate cgroup.procs cgroup.subtree_control Signed-off-by: Roman Gushchin Cc: Tejun Heo Cc: kernel-team@fb.com Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6ed725f36d9..eed92ed624e5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5832,3 +5832,64 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, return ret; } #endif /* CONFIG_CGROUP_BPF */ + +#ifdef CONFIG_SYSFS +static ssize_t show_delegatable_files(struct cftype *files, char *buf, + ssize_t size, const char *prefix) +{ + struct cftype *cft; + ssize_t ret = 0; + + for (cft = files; cft && cft->name[0] != '\0'; cft++) { + if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) + continue; + + if (prefix) + ret += snprintf(buf + ret, size - ret, "%s.", prefix); + + ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); + + if (unlikely(ret >= size)) { + WARN_ON(1); + break; + } + } + + return ret; +} + +static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct cgroup_subsys *ss; + int ssid; + ssize_t ret = 0; + + ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, + NULL); + + for_each_subsys(ss, ssid) + ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, + PAGE_SIZE - ret, + cgroup_subsys_name[ssid]); + + return ret; +} +static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); + +static struct attribute *cgroup_sysfs_attrs[] = { + &cgroup_delegate_attr.attr, + NULL, +}; + +static const struct attribute_group cgroup_sysfs_attr_group = { + .attrs = cgroup_sysfs_attrs, + .name = "cgroup", +}; + +static int __init cgroup_sysfs_init(void) +{ + return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); +} +subsys_initcall(cgroup_sysfs_init); +#endif /* CONFIG_SYSFS */ -- cgit v1.2.3 From 5f2e673405b742be64e7c3604ed4ed3ac14f35ce Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Nov 2017 13:30:29 -0500 Subject: cgroup: export list of cgroups v2 features using sysfs The active development of cgroups v2 sometimes leads to a creation of interfaces, which are not turned on by default (to provide backward compatibility). It's handy to know from userspace, which cgroup v2 features are supported without calculating it based on the kernel version. So, let's export the list of such features using /sys/kernel/cgroup/features pseudo-file. The list is hardcoded and has to be extended when new functionality is added. Each feature is printed on a new line. 
Example: $ cat /sys/kernel/cgroup/features nsdelegate Signed-off-by: Roman Gushchin Cc: Tejun Heo Cc: kernel-team@fb.com Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index eed92ed624e5..69e65d28fe98 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5877,8 +5877,16 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); +static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); +} +static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); + static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, + &cgroup_features_attr.attr, NULL, }; -- cgit v1.2.3 From e846d13958066828a9483d862cc8370a72fadbb6 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Thu, 2 Nov 2017 09:18:21 +0800 Subject: kprobes, x86/alternatives: Use text_mutex to protect smp_alt_modules We use alternatives_text_reserved() to check if the address is in the fixed pieces of alternative reserved, but the problem is that we don't hold the smp_alt mutex when call this function. So the list traversal may encounter a deleted list_head if another path is doing alternatives_smp_module_del(). One solution is that we can hold smp_alt mutex before call this function, but the difficult point is that the callers of this functions, arch_prepare_kprobe() and arch_prepare_optimized_kprobe(), are called inside the text_mutex. So we must hold smp_alt mutex before we go into these arch dependent code. But we can't now, the smp_alt mutex is the arch dependent part, only x86 has it. Maybe we can export another arch dependent callback to solve this. But there is a simpler way to handle this problem. We can reuse the text_mutex to protect smp_alt_modules instead of using another mutex. And all the arch dependent checks of kprobes are inside the text_mutex, so it's safe now. Signed-off-by: Zhou Chengming Reviewed-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Fixes: 2cfa197 "ftrace/alternatives: Introducing *_text_reserved functions" Link: http://lkml.kernel.org/r/1509585501-79466-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Ingo Molnar --- kernel/extable.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 9aa1cc41ecf7..a17fdb63dc3e 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -31,6 +31,8 @@ * mutex protecting text section modification (dynamic code patching). * some users need to sleep (allocating memory...) while they hold this lock. * + * Note: Also protects SMP-alternatives modification on x86. + * * NOT exported to modules - patching kernel text is a really delicate matter. 
*/ DEFINE_MUTEX(text_mutex); -- cgit v1.2.3 From f791dd2589d7e217625d7e411621a5bac71cbf69 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 3 Nov 2017 18:59:48 +0800 Subject: locking/rwlocks: Fix comments - fix the list of locking API headers in kernel/locking/spinlock.c - fix an #endif comment Signed-off-by: Cheng Jian Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: huawei.libin@huawei.com Cc: xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1509706788-152547-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b96343374a87..1fd1a7543cdd 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -30,7 +30,8 @@ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) /* * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h + * spinlock : include/linux/spinlock_api_smp.h + * rwlock : include/linux/rwlock_api_smp.h */ #else -- cgit v1.2.3 From 4ac2aed837cbdbb21c12a28c04718e34c1dc225f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:50 -0500 Subject: resource: Consolidate resource walking code The walk_iomem_res_desc(), walk_system_ram_res() and walk_system_ram_range() functions each have much of the same code. Create a new function that consolidates the common code from these functions in one place to reduce the amount of duplicated code. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171020143059.3291-9-brijesh.singh@amd.com --- kernel/resource.c | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 9b5f04404152..7323c1b636cd 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -400,6 +400,26 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, return 0; } +static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, + bool first_level_children_only, + void *arg, int (*func)(u64, u64, void *)) +{ + u64 orig_end = res->end; + int ret = -1; + + while ((res->start < res->end) && + !find_next_iomem_res(res, desc, first_level_children_only)) { + ret = (*func)(res->start, res->end, arg); + if (ret) + break; + + res->start = res->end + 1; + res->end = orig_end; + } + + return ret; +} + /* * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. 
@@ -418,26 +438,12 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) { struct resource res; - u64 orig_end; - int ret = -1; res.start = start; res.end = end; res.flags = flags; - orig_end = res.end; - - while ((res.start < res.end) && - (!find_next_iomem_res(&res, desc, false))) { - - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - - res.start = res.end + 1; - res.end = orig_end; - } - return ret; + return __walk_iomem_res_desc(&res, desc, false, arg, func); } /* @@ -451,22 +457,13 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) { struct resource res; - u64 orig_end; - int ret = -1; res.start = start; res.end = end; res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && - (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; + + return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + arg, func); } #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) @@ -508,6 +505,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; } + /* * This generic page_is_ram() returns true if specified address is * registered as System RAM in iomem_resource list. -- cgit v1.2.3 From 1d2e733b13b450e5854f4a8f8efcd77fa7362d62 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:51 -0500 Subject: resource: Provide resource struct in resource walk callback In preperation for a new function that will need additional resource information during the resource walk, update the resource walk callback to pass the resource structure. Since the current callback start and end arguments are pulled from the resource structure, the callback functions can obtain them from the resource structure directly. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: kvm@vger.kernel.org Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20171020143059.3291-10-brijesh.singh@amd.com --- kernel/kexec_file.c | 5 +++-- kernel/resource.c | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9f48f4412297..e5bcd94c1efb 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, return 1; } -static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +static int locate_mem_hole_callback(struct resource *res, void *arg) { struct kexec_buf *kbuf = (struct kexec_buf *)arg; + u64 start = res->start, end = res->end; unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ @@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) * func returning non-zero, then zero will be returned. 
*/ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, diff --git a/kernel/resource.c b/kernel/resource.c index 7323c1b636cd..8430042fa77b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -402,14 +402,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, bool first_level_children_only, - void *arg, int (*func)(u64, u64, void *)) + void *arg, + int (*func)(struct resource *, void *)) { u64 orig_end = res->end; int ret = -1; while ((res->start < res->end) && !find_next_iomem_res(res, desc, first_level_children_only)) { - ret = (*func)(res->start, res->end, arg); + ret = (*func)(res, arg); if (ret) break; @@ -435,7 +436,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, * and set it in 'desc' of a target resource entry. */ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, - u64 end, void *arg, int (*func)(u64, u64, void *)) + u64 end, void *arg, int (*func)(struct resource *, void *)) { struct resource res; @@ -454,7 +455,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { struct resource res; -- cgit v1.2.3 From 0e4c12b45aa88e74fdda117896d2b61c4e510cb9 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Oct 2017 09:30:52 -0500 Subject: x86/mm, resource: Use PAGE_KERNEL protection for ioremap of memory pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order for memory pages to be properly mapped when SEV is active, it's necessary to use the PAGE_KERNEL protection attribute as the base protection. This ensures that memory mapping of, e.g. ACPI tables, receives the proper mapping attributes. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Laura Abbott Cc: Kees Cook Cc: kvm@vger.kernel.org Cc: Jérôme Glisse Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Andrew Morton Cc: Dan Williams Cc: "Kirill A. Shutemov" Link: https://lkml.kernel.org/r/20171020143059.3291-11-brijesh.singh@amd.com --- kernel/resource.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 8430042fa77b..54ba6de3757c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -397,6 +397,8 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, res->start = p->start; if (res->end > p->end) res->end = p->end; + res->flags = p->flags; + res->desc = p->desc; return 0; } @@ -467,6 +469,23 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, arg, func); } +/* + * This function calls the @func callback against all memory ranges, which + * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. 
+ */ +int walk_mem_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + arg, func); +} + #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* -- cgit v1.2.3 From 11752adb68a388724b1935d57bf543897c34d80b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 7 Nov 2017 16:18:06 -0500 Subject: locking/pvqspinlock: Implement hybrid PV queued/unfair locks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, all the lock waiters entering the slowpath will do one lock stealing attempt to acquire the lock. That helps performance, especially in VMs with over-committed vCPUs. However, the current pvqspinlocks still don't perform as good as unfair locks in many cases. On the other hands, unfair locks do have the problem of lock starvation that pvqspinlocks don't have. This patch combines the best attributes of an unfair lock and a pvqspinlock into a hybrid lock with 2 modes - queued mode & unfair mode. A lock waiter goes into the unfair mode when there are waiters in the wait queue but the pending bit isn't set. Otherwise, it will go into the queued mode waiting in the queue for its turn. On a 2-socket 36-core E5-2699 v3 system (HT off), a kernel build (make -j) was done in a VM with unpinned vCPUs 3 times with the best time selected and is the number of vCPUs available. The build times of the original pvqspinlock, hybrid pvqspinlock and unfair lock with various number of vCPUs are as follows: vCPUs pvqlock hybrid pvqlock unfair lock ----- ------- -------------- ----------- 30 342.1s 329.1s 329.1s 36 314.1s 305.3s 307.3s 45 345.0s 302.1s 306.6s 54 365.4s 308.6s 307.8s 72 358.9s 293.6s 303.9s 108 343.0s 285.9s 304.2s The hybrid pvqspinlock performs better or comparable to the unfair lock. By turning on QUEUED_LOCK_STAT, the table below showed the number of lock acquisitions in unfair mode and queue mode after a kernel build with various number of vCPUs. vCPUs queued mode unfair mode ----- ----------- ----------- 30 9,130,518 294,954 36 10,856,614 386,809 45 8,467,264 11,475,373 54 6,409,987 19,670,855 72 4,782,063 25,712,180 It can be seen that as the VM became more and more over-committed, the ratio of locks acquired in unfair mode increases. This is all done automatically to get the best overall performance as possible. Using a kernel locking microbenchmark with number of locking threads equals to the number of vCPUs available on the same machine, the minimum, average and maximum (min/avg/max) numbers of locking operations done per thread in a 5-second testing interval are shown below: vCPUs hybrid pvqlock unfair lock ----- -------------- ----------- 36 822,135/881,063/950,363 75,570/313,496/ 690,465 54 542,435/581,664/625,937 35,460/204,280/ 457,172 72 397,500/428,177/499,299 17,933/150,679/ 708,001 108 257,898/288,150/340,871 3,085/181,176/1,257,109 It can be seen that the hybrid pvqspinlocks are more fair and performant than the unfair locks in this test. The table below shows the kernel build times on a smaller 2-socket 16-core 32-thread E5-2620 v4 system. vCPUs pvqlock hybrid pvqlock unfair lock ----- ------- -------------- ----------- 16 436.8s 433.4s 435.6s 36 366.2s 364.8s 364.5s 48 423.6s 376.3s 370.2s 64 433.1s 376.6s 376.8s Again, the performance of the hybrid pvqspinlock was comparable to that of the unfair lock. 
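A minimal standalone model of the mode-selection rule described above may help when reading the hunk that follows; the mask values are illustrative only (the real qspinlock word layout is configuration dependent) and the actual cmpxchg-based steal is in the patch itself.

#include <stdint.h>

#define Q_LOCKED_MASK	0x000000ffu	/* lock holder byte */
#define Q_PENDING_MASK	0x0000ff00u	/* set by the MCS queue head */
#define Q_TAIL_MASK	0xffff0000u	/* non-zero while waiters are queued */

enum waiter_action { TRY_STEAL, ENTER_QUEUE, KEEP_SPINNING };

/* What a lock waiter does for one snapshot of the lock word. */
enum waiter_action hybrid_step(uint32_t val)
{
	if (!(val & (Q_LOCKED_MASK | Q_PENDING_MASK)))
		return TRY_STEAL;	/* unfair mode: lock free, pending clear */
	if (!(val & Q_TAIL_MASK) || (val & Q_PENDING_MASK))
		return ENTER_QUEUE;	/* queued mode: queue empty or head ready */
	return KEEP_SPINNING;		/* waiters queued, pending clear: keep spinning */
}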
Signed-off-by: Waiman Long Reviewed-by: Juergen Gross Reviewed-by: Eduardo Valentin Acked-by: Peter Zijlstra Cc: Boris Ostrovsky Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1510089486-3466-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 47 ++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 15b6a39366c6..6ee477765e6c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -61,21 +61,50 @@ struct pv_node { #include "qspinlock_stat.h" /* + * Hybrid PV queued/unfair lock + * * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before - * being queued. By allowing one lock stealing attempt here when the pending - * bit is off, it helps to reduce the performance impact of lock waiter - * preemption without the drawback of lock starvation. + * being queued. + * + * The pending bit is set by the queue head vCPU of the MCS wait queue in + * pv_wait_head_or_lock() to signal that it is ready to spin on the lock. + * When that bit becomes visible to the incoming waiters, no lock stealing + * is allowed. The function will return immediately to make the waiters + * enter the MCS wait queue. So lock starvation shouldn't happen as long + * as the queued mode vCPUs are actively running to set the pending bit + * and hence disabling lock stealing. + * + * When the pending bit isn't set, the lock waiters will stay in the unfair + * mode spinning on the lock unless the MCS wait queue is empty. In this + * case, the lock waiters will enter the queued mode slowpath trying to + * become the queue head and set the pending bit. + * + * This hybrid PV queued/unfair lock combines the best attributes of a + * queued lock (no lock starvation) and an unfair lock (good performance + * on not heavily contended locks). */ -#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) -static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) +static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; - if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); - return true; + /* + * Stay in unfair lock mode as long as queued mode waiters are + * present in the MCS wait queue but the pending bit isn't set. + */ + for (;;) { + int val = atomic_read(&lock->val); + + if (!(val & _Q_LOCKED_PENDING_MASK) && + (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + qstat_inc(qstat_pv_lock_stealing, true); + return true; + } + if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) + break; + + cpu_relax(); } return false; -- cgit v1.2.3 From f71b74bca637fca7de78f1d44eaefde5bc900f9f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:18 +0100 Subject: irq/softirqs: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . 
Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/softirq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 4e09821f9d9e..662f7b1b7a78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -137,7 +137,7 @@ EXPORT_SYMBOL(__local_bh_disable_ip); static void __local_bh_enable(unsigned int cnt) { - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); @@ -158,7 +158,8 @@ EXPORT_SYMBOL(_local_bh_enable); void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { - WARN_ON_ONCE(in_irq() || irqs_disabled()); + WARN_ON_ONCE(in_irq()); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_disable(); #endif @@ -396,9 +397,8 @@ void irq_exit(void) #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); #else - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); #endif - account_irq_exit_time(current); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) @@ -488,7 +488,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - BUG_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); t->next = __this_cpu_read(tasklet_hi_vec.head); __this_cpu_write(tasklet_hi_vec.head, t); -- cgit v1.2.3 From 8e8eb730759f9cfd7a761b0b4ee41d714e720993 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:19 +0100 Subject: workqueue: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Tejun Heo Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1509980490-4285-4-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1070b21ba4aa..13f67b5a0a0c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1376,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); debug_work_activate(work); -- cgit v1.2.3 From ebf3adbad012b89c4a51a3beae718a587d988a3a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:20 +0100 Subject: timers/nohz: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-5-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c7a899c5ce64..dd4b7b492c9b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -198,7 +198,7 @@ static bool check_tick_dependency(atomic_t *dep) static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; @@ -960,8 +960,7 @@ void tick_nohz_idle_enter(void) { struct tick_sched *ts; - WARN_ON_ONCE(irqs_disabled()); - + lockdep_assert_irqs_enabled(); /* * Update the idle state in the scheduler domain hierarchy * when tick_nohz_stop_sched_tick() is called from the idle loop. -- cgit v1.2.3 From 53bef3fd47f69e40b52c9f9acd3551dfff9f8702 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:21 +0100 Subject: timers/hrtimer: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-6-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88f75f92ef36..d32520840fde 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -758,9 +758,7 @@ void clock_was_set(void) */ void hrtimers_resume(void) { - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - + lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); /* And schedule a retrigger for all others */ -- cgit v1.2.3 From 83efcbd028ad3aec36b5a3882cfa32490c135df7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:22 +0100 Subject: smp/core: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-7-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index c94dd85c8d41..084c8b3a2681 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -213,7 +213,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) call_single_data_t *csd, *csd_next; static bool warned; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); -- cgit v1.2.3 From 164446455a5d3f1402b5a0ea42acce33fd576ed7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:24 +0100 Subject: perf/core: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-9-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b315aebbcc3f..c298847d4b85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -209,7 +209,7 @@ static int event_function(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* @@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) @@ -1006,7 +1006,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) struct perf_cpu_context *cpuctx; int rotations = 0; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); rotations = perf_rotate_context(cpuctx); @@ -1093,7 +1093,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) { struct list_head *head = this_cpu_ptr(&active_ctx_list); - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(!list_empty(&ctx->active_ctx_list)); @@ -1102,7 +1102,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) static void perf_event_ctx_deactivate(struct perf_event_context *ctx) { - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(list_empty(&ctx->active_ctx_list)); @@ -3523,7 +3523,7 @@ void perf_event_task_tick(void) struct perf_event_context *ctx, *tmp; int throttled; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); -- cgit v1.2.3 From a934d4d15f040cdd254350e7270b35cc7521e12e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:25 +0100 Subject: irq/timings: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. 
This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-10-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq/timings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index c8c1d073fbf1..e0923fa4927a 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now) * order to prevent the timings circular buffer to be updated * while we are reading it. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); /* * Number of elements in the circular buffer: If it happens it -- cgit v1.2.3 From 3c7169a3bf8216a56761a8edf775072dd36a00a0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:26 +0100 Subject: irq_work: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..899579657a0a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -188,7 +188,7 @@ void irq_work_tick(void) */ void irq_work_sync(struct irq_work *work) { - WARN_ON_ONCE(irqs_disabled()); + lockdep_assert_irqs_enabled(); while (work->flags & IRQ_WORK_BUSY) cpu_relax(); -- cgit v1.2.3 From 2c11dba00a39007b457c7607c1b1a4db95ca04bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:27 +0100 Subject: sched/clock, sched/cputime: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-12-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 2 +- kernel/sched/cputime.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index ca0f8fc945c6..e086babe6c61 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -388,7 +388,7 @@ void sched_clock_tick(void) if (unlikely(!sched_clock_running)) return; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); scd = this_scd(); __scd_stamp(scd); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..9be8b68a66da 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max) { u64 accounted; - /* Shall be converted to a lockdep-enabled lightweight check */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); accounted = steal_account_process_time(max); -- cgit v1.2.3 From a69682200db9c2c26594188f81dd2df560af4683 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:28 +0100 Subject: timers/posix-cpu-timers: Use lockdep to assert IRQs are disabled/enabled Use lockdep to check that IRQs are enabled or disabled as expected. This way the sanity check only shows overhead when concurrency correctness debug code is enabled. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-13-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/time/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5b117110b55b..1f27887aa194 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -603,7 +603,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, /* * Disarm any old timer after extracting its expiry time. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); ret = 0; old_incr = timer->it.cpu.incr; @@ -1034,7 +1034,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); arm_timer(timer); unlock: unlock_task_sighand(p, &flags); @@ -1125,7 +1125,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) struct k_itimer *timer, *next; unsigned long flags; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); /* * The fast path checks that there are no expired thread or thread -- cgit v1.2.3 From b04db8e19fc2e9131524dec43057c1b96d5ba3ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:30 +0100 Subject: rcu: Use lockdep to assert IRQs are disabled/enabled Lockdep now has an integrated IRQs disabled/enabled sanity check. Just use it instead of the ad-hoc RCU version. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Cc: David S . 
Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-15-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/rcu/tree.c | 16 ++++++++-------- kernel/rcu/tree_plugin.h | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3e3650e94ae6..08fa586d7f8c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); return READ_ONCE(*fp); } @@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) @@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -840,7 +840,7 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } @@ -855,7 +855,7 @@ void rcu_idle_enter(void) */ void rcu_user_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ @@ -880,7 +880,7 @@ void rcu_irq_exit(void) { struct rcu_dynticks *rdtp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ @@ -947,7 +947,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -1018,7 +1018,7 @@ void rcu_irq_enter(void) struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..df08e5c8126b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -325,7 +325,7 @@ static void rcu_preempt_note_context_switch(bool preempt) struct rcu_data *rdp; struct rcu_node *rnp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -1421,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1470,7 +1470,7 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; @@ -1525,7 +1525,7 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) @@ -2012,7 +2012,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (!rcu_is_nocb_cpu(smp_processor_id())) return false; /* Not NOCBs CPU, caller must migrate CBs. */ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), -- cgit v1.2.3 From fda784e50aace694ec2e4e16e2de07b91a938563 Mon Sep 17 00:00:00 2001 From: "Bruno E. O. Meneguele" Date: Tue, 24 Oct 2017 15:37:00 -0200 Subject: module: export module signature enforcement status A static variable sig_enforce is used as status var to indicate the real value of CONFIG_MODULE_SIG_FORCE, once this one is set the var will hold true, but if the CONFIG is not set the status var will hold whatever value is present in the module.sig_enforce kernel cmdline param: true when =1 and false when =0 or not present. Considering this cmdline param take place over the CONFIG value when it's not set, other places in the kernel could misbehave since they would have only the CONFIG_MODULE_SIG_FORCE value to rely on. Exporting this status var allows the kernel to rely in the effective value of module signature enforcement, being it from CONFIG value or cmdline param. Signed-off-by: Bruno E. O. Meneguele Signed-off-by: Mimi Zohar --- kernel/module.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..d1c194b057a2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); #endif /* !CONFIG_MODULE_SIG_FORCE */ +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. 
+ */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3 From c0f3ea1589394deac2d840c685f57c69e4ac4243 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Nov 2017 12:51:04 -0800 Subject: stop using '%pK' for /proc/kallsyms pointer values Not only is it annoying to have one single flag for all pointers, as if that was a global choice and all kernel pointers are the same, but %pK can't get the 'access' vs 'open' time check right anyway. So make the /proc/kallsyms pointer value code use logic specific to that particular file. We do continue to honor kptr_restrict, but the default (which is unrestricted) is changed to instead take expected users into account, and restrict access by default. Right now the only actual expected user is kernel profiling, which has a separate sysctl flag for kernel profile access. There may be others. Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 127e7cfafa55..51b49ed452e4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -480,6 +480,7 @@ struct kallsym_iter { char name[KSYM_NAME_LEN]; char module_name[MODULE_NAME_LEN]; int exported; + int show_value; }; static int get_ksymbol_mod(struct kallsym_iter *iter) @@ -580,14 +581,23 @@ static void s_stop(struct seq_file *m, void *p) { } +#ifndef CONFIG_64BIT +# define KALLSYM_FMT "%08lx" +#else +# define KALLSYM_FMT "%016lx" +#endif + static int s_show(struct seq_file *m, void *p) { + unsigned long value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; + value = iter->show_value ? iter->value : 0; + if (iter->module_name[0]) { char type; @@ -597,10 +607,10 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else - seq_printf(m, "%pK %c %s\n", (void *)iter->value, + seq_printf(m, KALLSYM_FMT " %c %s\n", value, iter->type, iter->name); return 0; } @@ -612,6 +622,40 @@ static const struct seq_operations kallsyms_op = { .show = s_show }; +static inline int kallsyms_for_perf(void) +{ +#ifdef CONFIG_PERF_EVENTS + extern int sysctl_perf_event_paranoid; + if (sysctl_perf_event_paranoid <= 1) + return 1; +#endif + return 0; +} + +/* + * We show kallsyms information even to normal users if we've enabled + * kernel profiling and are explicitly not paranoid (so kptr_restrict + * is clear, and sysctl_perf_event_paranoid isn't set). + * + * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to + * block even that). 
+ */ +static int kallsyms_show_value(void) +{ + switch (kptr_restrict) { + case 0: + if (kallsyms_for_perf()) + return 1; + /* fallthrough */ + case 1: + if (has_capability_noaudit(current, CAP_SYSLOG)) + return 1; + /* fallthrough */ + default: + return 0; + } +} + static int kallsyms_open(struct inode *inode, struct file *file) { /* @@ -625,6 +669,7 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); + iter->show_value = kallsyms_show_value(); return 0; } -- cgit v1.2.3 From 95b982b45122c57da2ee0b46cce70775e1d987af Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Tue, 31 Oct 2017 14:44:24 -0700 Subject: PM / s2idle: Clear the events_check_enabled flag Problem: This flag does not get cleared currently in the suspend or resume path in the following cases: * In case some driver's suspend routine returns an error. * Successful s2idle case * etc? Why is this a problem: What happens is that the next suspend attempt could fail even though the user did not enable the flag by writing to /sys/power/wakeup_count. This is 1 use case how the issue can be seen (but similar use case with driver suspend failure can be thought of): 1. Read /sys/power/wakeup_count 2. echo count > /sys/power/wakeup_count 3. echo freeze > /sys/power/wakeup_count 4. Let the system suspend, and wakeup the system using some wake source that calls pm_wakeup_event() e.g. power button or something. 5. Note that the combined wakeup count would be incremented due to the pm_wakeup_event() in the resume path. 6. After resuming the events_check_enabled flag is still set. At this point if the user attempts to freeze again (without writing to /sys/power/wakeup_count), the suspend would fail even though there has been no wake event since the past resume. Address that by clearing the flag just before a resume is completed, so that it is always cleared for the corner cases mentioned above. Signed-off-by: Rajat Jain Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ccd2d20e6b06..0685c4499431 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -437,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = suspend_ops->enter(state); trace_suspend_resume(TPS("machine_suspend"), state, false); - events_check_enabled = false; } else if (*wakeup) { error = -EBUSY; } @@ -582,6 +581,7 @@ static int enter_state(suspend_state_t state) pm_restore_gfp_mask(); Finish: + events_check_enabled = false; pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: -- cgit v1.2.3 From 07458f6a5171d97511dfbdf6ce549ed2ca0280c7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 8 Nov 2017 20:23:55 +0530 Subject: cpufreq: schedutil: Reset cached_raw_freq when not in sync with next_freq 'cached_raw_freq' is used to get the next frequency quickly but should always be in sync with sg_policy->next_freq. There is a case where it is not and in such cases it should be reset to avoid switching to incorrect frequencies. Consider this case for example: - policy->cur is 1.2 GHz (Max) - New request comes for 780 MHz and we store that in cached_raw_freq. - Based on 780 MHz, we calculate the effective frequency as 800 MHz. - We then see the CPU wasn't idle recently and choose to keep the next freq as 1.2 GHz. - Now we have cached_raw_freq is 780 MHz and sg_policy->next_freq is 1.2 GHz. 
- Now if the utilization doesn't change in then next request, then the next target frequency will still be 780 MHz and it will match with cached_raw_freq. But we will choose 1.2 GHz instead of 800 MHz here. Fixes: b7eaf1aab9f8 (cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely) Signed-off-by: Viresh Kumar Cc: 4.12+ # 4.12+ Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index ba0da243fdd8..2f52ec0f1539 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq) + if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; + + /* Reset cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = 0; + } } sugov_update_commit(sg_policy, time, next_f); } -- cgit v1.2.3 From 765cc3a4b224e22bf524fabe40284a524f37cdd0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 8 Nov 2017 18:41:01 +0000 Subject: sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds When the kernel is compiled with !CONFIG_SCHED_DEBUG support, we expect that all SCHED_FEAT are turned into compile time constants being propagated to support compiler optimizations. Specifically, we expect that code blocks like this: if (sched_feat(FEATURE_NAME) [&& ]) { /* FEATURE CODE */ } are turned into dead-code in case FEATURE_NAME defaults to FALSE, and thus being removed by the compiler from the finale image. For this mechanism to properly work it's required for the compiler to have full access, from each translation unit, to whatever is the value defined by the sched_feat macro. This macro is defined as: #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) and thus, the compiler can optimize that code only if the value of sysctl_sched_features is visible within each translation unit. Since: 029632fbb ("sched: Make separate sched*.c translation units") the scheduler code has been split into separate translation units however the definition of sysctl_sched_features is part of kernel/sched/core.c while, for all the other scheduler modules, it is visible only via kernel/sched/sched.h as an: extern const_debug unsigned int sysctl_sched_features Unfortunately, an extern reference does not allow the compiler to apply constants propagation. Thus, on !CONFIG_SCHED_DEBUG kernel we still end up with code to load a memory reference and (eventually) doing an unconditional jump of a chunk of code. This mechanism is unavoidable when sched_features can be turned on and off at run-time. However, this is not the case for "production" kernels compiled with !CONFIG_SCHED_DEBUG. In this case, sysctl_sched_features is just a constant value which cannot be changed at run-time and thus memory loads and jumps can be avoided altogether. This patch fixes the case of !CONFIG_SCHED_DEBUG kernel by declaring a local version of the sysctl_sched_features constant for each translation unit. This will ultimately allow the compiler to perform constants propagation and dead-code pruning. 
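To make the effect concrete, here is a minimal userspace sketch (illustrative names only, not the scheduler sources) of the same pattern: with a per-translation-unit static const mask the compiler can fold the feature test to a constant and drop the dead branch, whereas an extern declaration forces a memory load and a conditional jump.

#include <stdio.h>

enum { __FEAT_GENTLE_FAIR_SLEEPERS, __FEAT_START_DEBIT, __FEAT_NR };

/* analogue of the !SCHED_DEBUG case: the value is visible in this unit */
static const unsigned int sysctl_features =
	(1U << __FEAT_GENTLE_FAIR_SLEEPERS) * 1 |
	(1U << __FEAT_START_DEBIT) * 0 |
	0;

#define feat(x) (sysctl_features & (1U << __FEAT_##x))

int main(void)
{
	if (feat(GENTLE_FAIR_SLEEPERS))	/* the compiler can fold this to "always true" */
		printf("gentle fair sleepers\n");
	if (feat(START_DEBIT))		/* and can drop this branch as dead code */
		printf("start debit\n");
	return 0;
}

Building such a sketch at -O2 and comparing the generated code against an extern variant should show the dead branch disappear; the same reasoning applies to sched_feat() once sysctl_sched_features is a per-unit constant.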
Tests have been done, with !CONFIG_SCHED_DEBUG on a v4.14-rc8 with and without the patch, by running 30 iterations of: perf bench sched messaging --pipe --thread --group 4 --loop 50000 on a 40 cores Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz using the powersave governor to rule out variations due to frequency scaling. Statistics on the reported completion time: count mean std min 99% max v4.14-rc8 30.0 15.7831 0.176032 15.442 16.01226 16.014 v4.14-rc8+patch 30.0 15.5033 0.189681 15.232 15.93938 15.962 ... show a 1.8% speedup on average completion time and 0.5% speedup in the 99 percentile. Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath Reviewed-by: Dietmar Eggemann Reviewed-by: Brendan Jackman Acked-by: Peter Zijlstra Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20171108184101.16006-1-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- kernel/sched/sched.h | 25 ++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a55c842bfbc..a39a08113003 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -43,18 +43,21 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) /* * Debugging: various feature bits + * + * If SCHED_DEBUG is disabled, each compilation unit has its own copy of + * sysctl_sched_features, defined in sched.h, to allow constants propagation + * at compile time and compiler optimization based on features default. */ - #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | - const_debug unsigned int sysctl_sched_features = #include "features.h" 0; - #undef SCHED_FEAT +#endif /* * Number of tasks to iterate in a single balance run. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 58787e3631c7..45ab0bf564e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1233,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) # define const_debug const #endif -extern const_debug unsigned int sysctl_sched_features; - #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -1246,6 +1244,13 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) + +/* + * To support run-time toggling of sched features, all the translation units + * (but core.c) reference the sysctl_sched_features defined in core.c. + */ +extern const_debug unsigned int sysctl_sched_features; + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1253,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ } #include "features.h" - #undef SCHED_FEAT extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) + #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ + +/* + * Each translation unit has its own copy of sysctl_sched_features to allow + * constants propagation at compile time and compiler optimization based on + * features default. 
+ */ +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | +static const_debug __maybe_unused unsigned int sysctl_sched_features = +#include "features.h" + 0; +#undef SCHED_FEAT + #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; -- cgit v1.2.3 From 4f8413a3a799c958f7a10a6310a451e6b8aef5ad Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 9 Nov 2017 14:17:59 +0000 Subject: genirq: Track whether the trigger type has been set When requesting a shared interrupt, we assume that the firmware support code (DT or ACPI) has called irqd_set_trigger_type already, so that we can retrieve it and check that the requester is being reasonnable. Unfortunately, we still have non-DT, non-ACPI systems around, and these guys won't call irqd_set_trigger_type before requesting the interrupt. The consequence is that we fail the request that would have worked before. We can either chase all these use cases (boring), or address it in core code (easier). Let's have a per-irq_desc flag that indicates whether irqd_set_trigger_type has been called, and let's just check it when checking for a shared interrupt. If it hasn't been set, just take whatever the interrupt requester asks. Fixes: 382bd4de6182 ("genirq: Use irqd_get_trigger_type to compare the trigger type for shared IRQs") Cc: stable@vger.kernel.org Reported-and-tested-by: Petr Cvek Signed-off-by: Marc Zyngier --- kernel/irq/manage.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e667912d0e9c..21e04e780be4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1228,7 +1228,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * set the trigger type must match. Also all must * agree on ONESHOT. */ - unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data); + unsigned int oldtype; + + /* + * If nobody did set the configuration before, inherit + * the one provided by the requester. + */ + if (irqd_trigger_type_was_set(&desc->irq_data)) { + oldtype = irqd_get_trigger_type(&desc->irq_data); + } else { + oldtype = new->flags & IRQF_TRIGGER_MASK; + irqd_set_trigger_type(&desc->irq_data, oldtype); + } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK)) || -- cgit v1.2.3 From 173743dd99a49c956b124a74c8aacb0384739a4c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:34 -0400 Subject: audit: ensure that 'audit=1' actually enables audit for PID 1 Prior to this patch we enabled audit in audit_init(), which is too late for PID 1 as the standard initcalls are run after the PID 1 task is forked. This means that we never allocate an audit_context (see audit_alloc()) for PID 1 and therefore miss a lot of audit events generated by PID 1. This patch enables audit as early as possible to help ensure that when PID 1 is forked it can allocate an audit_context if required. 
Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index be1c28fd4d57..ec3d0802734d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,13 +85,13 @@ static int audit_initialized; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 -u32 audit_enabled; -u32 audit_ever_enabled; +u32 audit_enabled = AUDIT_OFF; +u32 audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ -static u32 audit_default; +static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; @@ -1549,8 +1549,6 @@ static int __init audit_init(void) register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { @@ -1572,6 +1570,8 @@ static int __init audit_enable(char *str) audit_default = !!simple_strtol(str, NULL, 0); if (!audit_default) audit_initialized = AUDIT_DISABLED; + audit_enabled = audit_default; + audit_ever_enabled = !!audit_enabled; pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); -- cgit v1.2.3 From be4104abf25c63ccef36e1e06262c73c0df9fd60 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:44 -0400 Subject: audit: initialize the audit subsystem as early as possible We can't initialize the audit subsystem until after the network layer is initialized (core_initcall), but do it soon after. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ec3d0802734d..db71c45ea6f8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1562,7 +1562,7 @@ static int __init audit_init(void) return 0; } -__initcall(audit_init); +postcore_initcall(audit_init); /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ static int __init audit_enable(char *str) -- cgit v1.2.3 From 80ab4df62706b882922c3bb0b05ce2c8ab10828a Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:51 -0400 Subject: audit: don't use simple_strtol() anymore The simple_strtol() function is deprecated, use kstrtol() instead. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index db71c45ea6f8..e75918f79a83 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1567,8 +1567,13 @@ postcore_initcall(audit_init); /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ static int __init audit_enable(char *str) { - audit_default = !!simple_strtol(str, NULL, 0); - if (!audit_default) + long val; + + if (kstrtol(str, 0, &val)) + panic("audit: invalid 'audit' parameter value (%s)\n", str); + audit_default = (val ? 
AUDIT_ON : AUDIT_OFF); + + if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; audit_enabled = audit_default; audit_ever_enabled = !!audit_enabled; -- cgit v1.2.3 From b3b4fdf6a8ae9ea51ea274e3f2f8a6a58b98cc0b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:57 -0400 Subject: audit: convert audit_ever_enabled to a boolean We were treating it as a boolean, let's make it a boolean to help avoid future mistakes. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/audit.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e75918f79a83..f0cf9bfc806c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -86,7 +86,7 @@ static int audit_initialized; #define AUDIT_ON 1 #define AUDIT_LOCKED 2 u32 audit_enabled = AUDIT_OFF; -u32 audit_ever_enabled = !!AUDIT_OFF; +bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); diff --git a/kernel/audit.h b/kernel/audit.h index b331d9b83f63..6bdaf6bd377e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -208,7 +208,7 @@ struct audit_context { struct audit_proctitle proctitle; }; -extern u32 audit_ever_enabled; +extern bool audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, -- cgit v1.2.3 From 5d842a5b77a58160625548fa6be2dc159179ffdb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:45:05 -0400 Subject: audit: use audit_set_enabled() in audit_enable() Use audit_set_enabled() to enable auditing during early boot. This obviously won't emit an audit change record, but it will work anyway and should help prevent in future problems by consolidating the enable/disable code in one function. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f0cf9bfc806c..67b3863261d4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1575,8 +1575,8 @@ static int __init audit_enable(char *str) if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; - audit_enabled = audit_default; - audit_ever_enabled = !!audit_enabled; + if (audit_set_enabled(audit_default)) + panic("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); -- cgit v1.2.3 From 33e8a907804428109ce1d12301c3365d619cc4df Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Tue, 17 Oct 2017 18:29:22 -0400 Subject: audit: Allow auditd to set pid to 0 to end auditing The API to end auditing has historically been for auditd to set the pid to 0. This patch restores that functionality. See: https://github.com/linux-audit/audit-kernel/issues/69 Reviewed-by: Richard Guy Briggs Signed-off-by: Steve Grubb Signed-off-by: Paul Moore --- kernel/audit.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 67b3863261d4..64e1d0ec19de 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid_t auditd_pid; struct pid *req_pid = task_tgid(current); - /* sanity check - PID values must match */ - if (new_pid != pid_vnr(req_pid)) + /* Sanity check - PID values must match. Setting + * pid to 0 is how auditd ends auditing. 
*/ + if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); - /* only the current auditd can unregister itself */ - if ((!new_pid) && (new_pid != auditd_pid)) { - audit_log_config_change("audit_pid", new_pid, - auditd_pid, 0); - return -EACCES; - } - /* replacing a healthy auditd is not allowed */ - if (auditd_pid && new_pid) { - audit_log_config_change("audit_pid", new_pid, - auditd_pid, 0); - return -EEXIST; + if (auditd_pid) { + /* replacing a healthy auditd is not allowed */ + if (new_pid) { + audit_log_config_change("audit_pid", + new_pid, auditd_pid, 0); + return -EEXIST; + } + /* only current auditd can unregister itself */ + if (pid_vnr(req_pid) != auditd_pid) { + audit_log_config_change("audit_pid", + new_pid, auditd_pid, 0); + return -EACCES; + } } if (new_pid) { -- cgit v1.2.3 From f7b53637c090bd8ce2dc74ad0f3aa1898aff2524 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 24 Oct 2017 18:52:31 -0700 Subject: Audit: remove unused audit_log_secctx function The function audit_log_secctx() is unused in the upstream kernel. All it does is wrap another function that doesn't need wrapping. It claims to give you the SELinux context, but that is not true if you are using a different security module. Signed-off-by: Casey Schaufler Reviewed-by: James Morris Signed-off-by: Paul Moore --- kernel/audit.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 64e1d0ec19de..227db99b0f19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2345,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } -#ifdef CONFIG_SECURITY -/** - * audit_log_secctx - Converts and logs SELinux context - * @ab: audit_buffer - * @secid: security number - * - * This is a helper function that calls security_secid_to_secctx to convert - * secid to secctx and then adds the (converted) SELinux context to the audit - * log by calling audit_log_format, thus also preventing leak of internal secid - * to userspace. If secid cannot be converted audit_panic is called. - */ -void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ - u32 len; - char *secctx; - - if (security_secid_to_secctx(secid, &secctx, &len)) { - audit_panic("Cannot convert secid to context"); - } else { - audit_log_format(ab, " obj=%s", secctx); - security_release_secctx(secctx, len); - } -} -EXPORT_SYMBOL(audit_log_secctx); -#endif - EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); -- cgit v1.2.3 From 42d5e37654e4cdb9fb2e2f3ab30045fee35c42d8 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 23 Aug 2017 07:03:39 -0400 Subject: audit: filter PATH records keyed on filesystem magic Tracefs or debugfs were causing hundreds to thousands of PATH records to be associated with the init_module and finit_module SYSCALL records on a few modules when the following rule was in place for startup: -a always,exit -F arch=x86_64 -S init_module -F key=mod-load Provide a method to ignore these large number of PATH records from overwhelming the logs if they are not of interest. Introduce a new filter list "AUDIT_FILTER_FS", with a new field type AUDIT_FSTYPE, which keys off the filesystem 4-octet hexadecimal magic identifier to filter specific filesystem PATH records. 
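The value matched here is the superblock magic (s_magic) of the parent inode's filesystem. From userspace the magic of a mounted filesystem can be read with statfs(2); a minimal sketch (the mount point below is only an example, tracefs is typically under /sys/kernel/tracing or /sys/kernel/debug/tracing):

#include <stdio.h>
#include <sys/vfs.h>

int main(void)
{
	struct statfs sfs;

	if (statfs("/sys/kernel/tracing", &sfs) != 0) {
		perror("statfs");
		return 1;
	}
	/* this is the value to pass as -F fstype=0x... */
	printf("fstype magic: 0x%lx\n", (unsigned long)sfs.f_type);
	return 0;
}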
An example rule would look like: -a never,filesystem -F fstype=0x74726163 -F key=ignore_tracefs -a never,filesystem -F fstype=0x64626720 -F key=ignore_debugfs Arguably the better way to address this issue is to disable tracefs and debugfs on boot from production systems. See: https://github.com/linux-audit/audit-kernel/issues/16 See: https://github.com/linux-audit/audit-userspace/issues/8 Test case: https://github.com/linux-audit/audit-testsuite/issues/42 Signed-off-by: Richard Guy Briggs [PM: fixed the whitespace damage in kernel/auditsc.c] Signed-off-by: Paul Moore --- kernel/auditfilter.c | 39 ++++++++++++++++++++++++++++++++------- kernel/auditsc.c | 23 +++++++++++++++++++++++ 2 files changed, 55 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0b0aa5854dac..4a1758adb222 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[3]), LIST_HEAD_INIT(audit_filter_list[4]), LIST_HEAD_INIT(audit_filter_list[5]), -#if AUDIT_NR_FILTERS != 6 + LIST_HEAD_INIT(audit_filter_list[6]), +#if AUDIT_NR_FILTERS != 7 #error Fix audit_filter_list initialiser #endif }; @@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_rules_list[3]), LIST_HEAD_INIT(audit_rules_list[4]), LIST_HEAD_INIT(audit_rules_list[5]), + LIST_HEAD_INIT(audit_rules_list[6]), }; DEFINE_MUTEX(audit_filter_mutex); @@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * #endif case AUDIT_FILTER_USER: case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; + case AUDIT_FSTYPE: + if (entry->rule.listnr != AUDIT_FILTER_FS) + return -EINVAL; + break; + } + + switch(entry->rule.listnr) { + case AUDIT_FILTER_FS: + switch(f->type) { + case AUDIT_FSTYPE: + case AUDIT_FILTERKEY: + break; + default: + return -EINVAL; + } } switch(f->type) { @@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return -EINVAL; /* FALL THROUGH */ case AUDIT_ARCH: + case AUDIT_FSTYPE: if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; @@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) + /* If any of these, don't count towards total */ + switch(entry->rule.listnr) { + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: dont_count = 1; + } #endif mutex_lock(&audit_filter_mutex); @@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry) #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) + /* If any of these, don't count towards total */ + switch(entry->rule.listnr) { + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: dont_count = 1; + } #endif mutex_lock(&audit_filter_mutex); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index aac1a41f82bd..c9bb29e17335 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ 
-1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent, struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; + struct audit_entry *e; + struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; + int i; if (!context->in_syscall) return; + rcu_read_lock(); + if (!list_empty(list)) { + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE) { + if (audit_comparator(parent->i_sb->s_magic, + f->op, f->val)) { + if (e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; + } + } + } + } + } + } + rcu_read_unlock(); + if (inode) handle_one(inode); -- cgit v1.2.3 From 1f2cac107c591c24b60b115d6050adc213d10fc0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:13:48 -0700 Subject: blktrace: fix unlocked access to init/start-stop/teardown sg.c calls into the blktrace functions without holding the proper queue mutex for doing setup, start/stop, or teardown. Add internal unlocked variants, and export the ones that do the proper locking. Fixes: 6da127ad0918 ("blktrace: Add blktrace ioctls to SCSI generic devices") Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 45a3928544ce..ea57dd94b2b2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -336,7 +336,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) blk_unregister_tracepoints(); } -int blk_trace_remove(struct request_queue *q) +static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; @@ -349,6 +349,17 @@ int blk_trace_remove(struct request_queue *q) return 0; } + +int blk_trace_remove(struct request_queue *q) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_remove(q); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, @@ -550,9 +561,8 @@ err: return ret; } -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) +static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -571,6 +581,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; } + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_setup(q, name, dev, bdev, arg); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -607,7 +630,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, } #endif -int blk_trace_startstop(struct request_queue *q, int start) +static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; struct blk_trace *bt = q->blk_trace; @@ -646,6 +669,17 @@ int blk_trace_startstop(struct request_queue *q, int start) return ret; } + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = 
__blk_trace_startstop(q, start); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_startstop); /* @@ -676,7 +710,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -687,10 +721,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) case BLKTRACESTART: start = 1; case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); + ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); + ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; @@ -708,10 +742,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { + mutex_lock(&q->blk_trace_mutex); + if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); + __blk_trace_startstop(q, 0); + __blk_trace_remove(q); } + + mutex_unlock(&q->blk_trace_mutex); } #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From a6da0024ffc19e0d47712bb5ca4fd083f76b07df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:16:09 -0700 Subject: blktrace: fix unlocked registration of tracepoints We need to ensure that tracepoints are registered and unregistered with the users of them. The existing atomic count isn't enough for that. Add a lock around the tracepoints, so we serialize access to them. This fixes cases where we have multiple users setting up and tearing down tracepoints, like this: CPU: 0 PID: 2995 Comm: syzkaller857118 Not tainted 4.14.0-rc5-next-20171018+ #36 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x41c kernel/panic.c:183 __warn+0x1c4/0x1e0 kernel/panic.c:546 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177 do_trap_no_signal arch/x86/kernel/traps.c:211 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:260 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:tracepoint_add_func kernel/tracepoint.c:210 [inline] RIP: 0010:tracepoint_probe_register_prio+0x397/0x9a0 kernel/tracepoint.c:283 RSP: 0018:ffff8801d1d1f6c0 EFLAGS: 00010293 RAX: ffff8801d22e8540 RBX: 00000000ffffffef RCX: ffffffff81710f07 RDX: 0000000000000000 RSI: ffffffff85b679c0 RDI: ffff8801d5f19818 RBP: ffff8801d1d1f7c8 R08: ffffffff81710c10 R09: 0000000000000004 R10: ffff8801d1d1f6b0 R11: 0000000000000003 R12: ffffffff817597f0 R13: 0000000000000000 R14: 00000000ffffffff R15: ffff8801d1d1f7a0 tracepoint_probe_register+0x2a/0x40 kernel/tracepoint.c:304 register_trace_block_rq_insert include/trace/events/block.h:191 [inline] blk_register_tracepoints+0x1e/0x2f0 kernel/trace/blktrace.c:1043 do_blk_trace_setup+0xa10/0xcf0 kernel/trace/blktrace.c:542 blk_trace_setup+0xbd/0x180 kernel/trace/blktrace.c:564 sg_ioctl+0xc71/0x2d90 drivers/scsi/sg.c:1089 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x444339 RSP: 002b:00007ffe05bb5b18 EFLAGS: 00000206 ORIG_RAX: 
0000000000000010 RAX: ffffffffffffffda RBX: 00000000006d66c0 RCX: 0000000000444339 RDX: 000000002084cf90 RSI: 00000000c0481273 RDI: 0000000000000009 RBP: 0000000000000082 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffffff R13: 00000000c0481273 R14: 0000000000000000 R15: 0000000000000000 since we can now run these in parallel. Ensure that the exported helpers for doing this are grabbing the queue trace mutex. Reported-by: Steven Rostedt Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ea57dd94b2b2..206e0e2ace53 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static atomic_t blk_probes_ref = ATOMIC_INIT(0); +static DEFINE_MUTEX(blk_probe_mutex); +static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); @@ -329,11 +330,26 @@ static void blk_trace_free(struct blk_trace *bt) kfree(bt); } +static void get_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (++blk_probes_ref == 1) + blk_register_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void put_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (!--blk_probes_ref) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + static void blk_trace_cleanup(struct blk_trace *bt) { blk_trace_free(bt); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); + put_probe_ref(); } static int __blk_trace_remove(struct request_queue *q) @@ -549,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); ret = 0; err: @@ -1596,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - + put_probe_ref(); blk_trace_free(bt); return 0; } @@ -1629,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q, if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); return 0; free_bt: -- cgit v1.2.3 From e10237cc76ef9a4066a84aa2cc710bfd708cc341 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 7 Nov 2017 11:09:50 -0800 Subject: kthread: zero the kthread data structure kthread() could bail out early before we initialize blkcg_css (if the kthread is killed very early. Please see xchg() statement in kthread()), which confuses free_kthread_struct. Instead of moving the blkcg_css initialization early, we simply zero the whole 'self' data structure, which doesn't sound much overhead. 
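A minimal userspace sketch of the pattern (illustrative names, not the kthread code): when the object can be torn down before every field has been assigned, a zeroing allocator lets the cleanup path safely test fields that were never set, which is the property the kzalloc() conversion provides for free_kthread_struct().

#include <stdlib.h>

struct worker {
	unsigned long flags;
	void *data;
	void *cgroup_css;	/* analogue of blkcg_css, assigned late or never */
};

static void free_worker(struct worker *w)
{
	if (!w)
		return;
	free(w->cgroup_css);	/* safe only if the field is NULL or valid */
	free(w);
}

int main(void)
{
	/* kzalloc() analogue: every field starts out zero/NULL */
	struct worker *w = calloc(1, sizeof(*w));

	/* bail out before cgroup_css is ever assigned ... */
	free_worker(w);		/* ... still safe: free(NULL) is a no-op */
	return 0;
}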
Reported-by: syzbot Fixes: 05e3db95ebfc ("kthread: add a mechanism to store cgroup info") Cc: Andrew Morton Cc: Ingo Molnar Cc: Dmitry Vyukov Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index f87cd8b4eb2a..8dbe2454cb1d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -204,7 +204,7 @@ static int kthread(void *_create) struct kthread *self; int ret; - self = kmalloc(sizeof(*self), GFP_KERNEL); + self = kzalloc(sizeof(*self), GFP_KERNEL); set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ @@ -220,13 +220,9 @@ static int kthread(void *_create) do_exit(-ENOMEM); } - self->flags = 0; self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_BLK_CGROUP - self->blkcg_css = NULL; -#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ -- cgit v1.2.3 From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:42 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with it's kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about with very specific situations. Accomplish this with the bpf_override_funciton helper. This will modify the probe'd callers return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 +++++++ kernel/trace/Kconfig | 11 +++++++++++ kernel/trace/bpf_trace.c | 35 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 6 ++++++ 7 files changed, 97 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a6c37762330..271daad31f37 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,6 +1326,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4a942e2e753d..bc464b8ec91e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,6 +4357,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index 42d24bd64ea4..ac240d31b5bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..9dc0deeaad2b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,6 +518,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 506efe6e8ed9..1865b0d4cdeb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} +#else +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + return -EINVAL; +} +#endif + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -551,6 +578,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_override_return: + pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", + current->comm, task_pid_nr(current)); + return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -766,6 +797,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* Kprobe override only works for ftrace based kprobes. 
*/ + if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index abf92e478cfb..8e3c9ec1faf7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1177,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1186,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1217,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1300,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1308,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 903273c93e61..adbb3f7d1fb5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,6 +253,7 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const 
char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit v1.2.3 From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_overrid_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 --- kernel/bpf/verifier.c | 2 -- kernel/events/core.c | 7 ------- kernel/trace/Kconfig | 11 ----------- kernel/trace/bpf_trace.c | 35 ----------------------------------- kernel/trace/trace_kprobe.c | 40 +++++++--------------------------------- kernel/trace/trace_probe.h | 6 ------ 7 files changed, 7 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 271daad31f37..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,9 +1326,6 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { - if (fp->kprobe_override) - return false; - if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc464b8ec91e..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,8 +4357,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_override_return) - prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index ac240d31b5bf..42d24bd64ea4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } - /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); - return -EINVAL; - } - if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9dc0deeaad2b..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,17 +518,6 @@ config FUNCTION_PROFILER If in doubt, say N. -config BPF_KPROBE_OVERRIDE - bool "Enable BPF programs to override a kprobed function" - depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE - depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS - default n - help - Allows BPF to override the execution of a probed function and - set a different return value. This is used for error injection. 
- config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1865b0d4cdeb..506efe6e8ed9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,10 +13,6 @@ #include #include #include -#include -#include - -#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - __this_cpu_write(bpf_kprobe_override, 1); - regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); - return 0; -} -#else -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - return -EINVAL; -} -#endif - -static const struct bpf_func_proto bpf_override_return_proto = { - .func = bpf_override_return, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, -}; - BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -578,10 +551,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_override_return: - pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", - current->comm, task_pid_nr(current)); - return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -797,10 +766,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; - /* Kprobe override only works for ftrace based kprobes. */ - if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) - return -EINVAL; - mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8e3c9ec1faf7..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,7 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) -{ - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); -} - static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1177,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static int +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1186,29 +1179,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call)) { - int ret; - - ret = trace_call_bpf(call, regs); - - /* - * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. 
Also reset our state so - * we are clean the next pass through. - */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); - reset_current_kprobe(); - return 1; - } - if (!ret) - return 0; - } + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) + return; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return 0; + return; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1217,14 +1193,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return 0; + return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); - return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1300,7 +1275,6 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1308,9 +1282,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - ret = kprobe_perf_func(tk, regs); + kprobe_perf_func(tk, regs); #endif - return ret; + return 0; /* We don't tweek kernel, so just return 0 */ } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index adbb3f7d1fb5..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,7 +253,6 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } - -static inline int trace_kprobe_ftrace(struct trace_event_call *call) -{ - return 0; -} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit v1.2.3 From d00a08cf9ee986ad6689ce8c6fd176aff679c106 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 12 Nov 2017 13:02:51 +0100 Subject: irq/work: Use llist_for_each_entry_safe The llist_for_each_entry() loop in irq_work_run_list() is unsafe because once the works PENDING bit is cleared it can be requeued on another CPU. Use llist_for_each_entry_safe() instead. 
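A minimal userspace sketch of the difference (simplified walkers over a plain singly-linked list, not the llist API itself): the safe variant saves the next pointer before the body runs, so the node may be freed or requeued by someone else as soon as it has been handed off.

#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int val;
};

/* unsafe: reads pos->next after the body may have released pos */
#define for_each(pos, head) \
	for (pos = (head); pos; pos = pos->next)

/* safe: the next pointer is loaded before the body runs */
#define for_each_safe(pos, n, head) \
	for (pos = (head); pos && ((n) = pos->next, 1); pos = (n))

static void process(struct node *e)
{
	printf("%d\n", e->val);
	free(e);		/* after this the node must not be touched */
}

int main(void)
{
	struct node *head = NULL, *pos, *n;
	int i;

	for (i = 0; i < 3; i++) {
		struct node *e = malloc(sizeof(*e));
		e->val = i;
		e->next = head;
		head = e;
	}

	/* for_each(pos, head) process(pos);  -- use-after-free on pos->next */
	for_each_safe(pos, n, head)
		process(pos);
	return 0;
}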
Fixes: 16c0890dc66d ("irq/work: Don't reinvent the wheel but use existing llist API") Reported-by:Chris Wilson Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Cc: Byungchul Park Cc: Peter Zijlstra Cc: Petri Latvala Link: http://lkml.kernel.org/r/151027307351.14762.4611888896020658384@mail.alporthouse.com --- kernel/irq_work.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e2ebe8c71e8f..6647b33f7eb0 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -128,9 +128,9 @@ bool irq_work_needs_cpu(void) static void irq_work_run_list(struct llist_head *list) { - unsigned long flags; - struct irq_work *work; + struct irq_work *work, *tmp; struct llist_node *llnode; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -138,7 +138,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry(work, llnode, llnode) { + llist_for_each_entry_safe(work, tmp, llnode, llnode) { /* * Clear the PENDING bit, after this point the @work * can be re-used. -- cgit v1.2.3 From df27067e6040b51188184876253d93da002433aa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 10 Nov 2017 16:25:04 +0100 Subject: pstore: Use ktime_get_real_fast_ns() instead of __getnstimeofday() __getnstimeofday() is a rather odd interface, with a number of quirks: - The caller may come from NMI context, but the implementation is not NMI safe, one way to get there from NMI is NMI handler: something bad panic() kmsg_dump() pstore_dump() pstore_record_init() __getnstimeofday() - The calling conventions are different from any other timekeeping functions, to deal with returning an error code during suspended timekeeping. Address the above issues by using a completely different method to get the time: ktime_get_real_fast_ns() is NMI safe and has a reasonable behavior when timekeeping is suspended: it returns the time at which it got suspended. As Thomas Gleixner explained, this is safe, as ktime_get_real_fast_ns() does not call into the clocksource driver that might be suspended. The result can easily be transformed into a timespec structure. Since ktime_get_real_fast_ns() was not exported to modules, add the export. The pstore behavior for the suspended case changes slightly, as it now stores the timestamp at which timekeeping was suspended instead of storing a zero timestamp. This change is not addressing y2038-safety, that's subject to a more complex follow up patch. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Cc: Tony Luck Cc: Anton Vorontsov Cc: Stephen Boyd Cc: John Stultz Cc: Colin Cross Link: https://lkml.kernel.org/r/20171110152530.1926955-1-arnd@arndb.de --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 353f7bd1eeb0..198afa78bf69 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -528,6 +528,7 @@ u64 ktime_get_real_fast_ns(void) { return __ktime_get_real_fast_ns(&tk_fast_mono); } +EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. 
-- cgit v1.2.3 From b24591e2fcf852ad7ad2ccf745c8220bf378d312 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 9 Nov 2017 12:35:07 +0000 Subject: timers: Add a function to start/reduce a timer Add a function, similar to mod_timer(), that will start a timer if it isn't running and will modify it if it is running and has an expiry time longer than the new time. If the timer is running with an expiry time that's the same or sooner, no change is made. The function looks like: int timer_reduce(struct timer_list *timer, unsigned long expires); This can be used by code such as networking code to make it easier to share a timer for multiple timeouts. For instance, in upcoming AF_RXRPC code, the rxrpc_call struct will maintain a number of timeouts: unsigned long ack_at; unsigned long resend_at; unsigned long ping_at; unsigned long expect_rx_by; unsigned long expect_req_by; unsigned long expect_term_by; each of which is set independently of the others. With timer reduction available, when the code needs to set one of the timeouts, it only needs to look at that timeout and then call timer_reduce() to modify the timer, starting it or bringing it forward if necessary. There is no need to refer to the other timeouts to see which is earliest and no need to take any lock other than, potentially, the timer lock inside timer_reduce(). Note, that this does not protect against concurrent invocations of any of the timer functions. As an example, the expect_rx_by timeout above, which terminates a call if we don't get a packet from the server within a certain time window, would be set something like this: unsigned long now = jiffies; unsigned long expect_rx_by = now + packet_receive_timeout; WRITE_ONCE(call->expect_rx_by, expect_rx_by); timer_reduce(&call->timer, expect_rx_by); The timer service code (which might, say, be in a work function) would then check all the timeouts to see which, if any, had triggered, deal with those: t = READ_ONCE(call->ack_at); if (time_after_eq(now, t)) { cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET); set_bit(RXRPC_CALL_EV_ACK, &call->events); } and then restart the timer if necessary by finding the soonest timeout that hasn't yet passed and then calling timer_reduce(). The disadvantage of doing things this way rather than comparing the timers each time and calling mod_timer() is that you *will* take timer events unless you can finish what you're doing and delete the timer in time. The advantage of doing things this way is that you don't need to use a lock to work out when the next timer should be set, other than the timer's own lock - which you might not have to take. 
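A self-contained sketch of the pattern described above, assuming a made-up struct my_call with one shared timer and two independent deadlines (the names are illustrative, not the actual AF_RXRPC code):

#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_call {
	struct timer_list timer;
	unsigned long expect_rx_by;
	unsigned long resend_at;
};

static void my_call_timer_expired(struct timer_list *t)
{
	/* check each deadline, service the ones that fired, then re-arm
	 * with the soonest remaining deadline via timer_reduce() */
}

static void my_call_init(struct my_call *call)
{
	timer_setup(&call->timer, my_call_timer_expired, 0);
}

static void my_call_set_rx_deadline(struct my_call *call, unsigned long timeout)
{
	unsigned long expire = jiffies + timeout;

	WRITE_ONCE(call->expect_rx_by, expire);
	/* start the timer if idle, pull it forward if it would fire later */
	timer_reduce(&call->timer, expire);
}

The expiry handler is the only place that needs to look at all the deadlines; the setters only ever shorten the timer.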
[ tglx: Fixed weird formatting and adopted it to pending changes ] Signed-off-by: David Howells Signed-off-by: Thomas Gleixner Cc: keyrings@vger.kernel.org Cc: linux-afs@lists.infradead.org Link: https://lkml.kernel.org/r/151023090769.23050.1801643667223880753.stgit@warthog.procyon.org.uk --- kernel/time/timer.c | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index fbb1f85327bf..af0b8bae4502 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -929,8 +929,11 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, } } +#define MOD_TIMER_PENDING_ONLY 0x01 +#define MOD_TIMER_REDUCE 0x02 + static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) +__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; @@ -950,7 +953,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * larger granularity than you would get from adding a new * timer with this expiry. */ - if (timer->expires == expires) + long diff = timer->expires - expires; + + if (!diff) + return 1; + if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* @@ -962,6 +969,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) base = lock_timer_base(timer, &flags); forward_timer_base(base); + if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && + time_before_eq(timer->expires, expires)) { + ret = 1; + goto out_unlock; + } + clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -971,7 +984,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { - timer->expires = expires; + if (!(options & MOD_TIMER_REDUCE)) + timer->expires = expires; + else if (time_after(timer->expires, expires)) + timer->expires = expires; ret = 1; goto out_unlock; } @@ -981,7 +997,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } ret = detach_if_pending(timer, base, false); - if (!ret && pending_only) + if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; debug_activate(timer, expires); @@ -1042,7 +1058,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true); + return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); @@ -1068,10 +1084,25 @@ EXPORT_SYMBOL(mod_timer_pending); */ int mod_timer(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, false); + return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); +/** + * timer_reduce - Modify a timer's timeout if it would reduce the timeout + * @timer: The timer to be modified + * @expires: New timeout in jiffies + * + * timer_reduce() is very similar to mod_timer(), except that it will only + * modify a running timer if that would reduce the expiration time (it will + * start a timer that isn't running). 
+ */ +int timer_reduce(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_REDUCE); +} +EXPORT_SYMBOL(timer_reduce); + /** * add_timer - start a timer * @timer: the timer to be added @@ -1754,7 +1785,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, false); + __mod_timer(&timer.timer, expire, 0); schedule(); del_singleshot_timer_sync(&timer.timer); -- cgit v1.2.3 From 6714796edcce27f7a1845e2f79783cd51bb4799b Mon Sep 17 00:00:00 2001 From: Wen Yaxng Date: Wed, 8 Nov 2017 09:55:03 +0800 Subject: genirq/proc: Return proper error code when irq_set_affinity() fails write_irq_affinity() returns the number of written bytes, which means success, unconditionally whether the actual irq_set_affinity() call succeeded or not. Add proper error handling and pass the error code returned from irq_set_affinity() back to user space in case of failure. [ tglx: Fixed coding style and massaged changelog ] Signed-off-by: Wen Yang Signed-off-by: Thomas Gleixner Reviewed-by: Jiang Biao Cc: zhong.weidong@zte.com.cn Link: https://lkml.kernel.org/r/1510106103-184761-1-git-send-email-wen.yang99@zte.com.cn --- kernel/irq/proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6376b4a598d3..29d6f520a9fb 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -154,8 +154,9 @@ static ssize_t write_irq_affinity(int type, struct file *file, */ err = irq_select_affinity_usr(irq) ? -EINVAL : count; } else { - irq_set_affinity(irq, new_value); - err = count; + err = irq_set_affinity(irq, new_value); + if (!err) + err = count; } free_cpumask: -- cgit v1.2.3 From 306eb5a38dfc1a4c8a5c910c9844da6a97f2b9a4 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sun, 12 Nov 2017 22:29:03 +0100 Subject: irqdomain: Drop pointless NULL check in virq_debug_show_one data has been already derefenced unconditionally, so it's pointless to do a NULL pointer check on it afterwards. Drop it. [ tglx: Depersonify changelog. ] Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20171112212904.28574-1-linux@rasmusvillemoes.dk --- kernel/irq/irqdomain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index fbbf34293b17..4f4f60015e8a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -921,8 +921,7 @@ static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) chip = irq_data_get_irq_chip(data); seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - seq_printf(m, data ? "0x%p " : " %p ", - irq_data_get_irq_chip_data(data)); + seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data)); seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); -- cgit v1.2.3 From ffc661c99f621152d5fdcf53f9df0d48c326318b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 30 Oct 2017 22:35:47 +0100 Subject: genirq: Fix type of shifting literal 1 in __setup_irq() If ffz() ever returns a value >= 31 then the following shift is undefined behaviour because the literal 1 which gets shifted is treated as signed integer. 
In practice, the bug is probably harmless, since the first undefined shift count is 31 which results - ignoring UB - in (int)(0x80000000). This gets sign extended so bit 32-63 will be set as well and all subsequent __setup_irq() calls would just end up hitting the -EBUSY branch. However, a sufficiently aggressive optimizer may use the UB of 1<<31 to decide that doesn't happen, and hence elide the sign-extension code, so that subsequent calls can indeed get ffz > 31. In any case, the right thing to do is to make the literal 1UL. [ tglx: For this to happen a single interrupt would have to be shared by 32 devices. Hardware like that does not exist and would have way more problems than that. ] Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171030213548.16831-1-linux@rasmusvillemoes.dk --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c65f282cbc9a..6a1bf9dc7a6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1289,7 +1289,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ - new->thread_mask = 1 << ffz(thread_mask); + new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { -- cgit v1.2.3 From 277642dcca765a1955d4c753a5a315ff7f2eb09d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Nov 2017 17:00:53 -0800 Subject: modules: make sysfs attribute files readable by owner only This code goes back to the historical bitkeeper tree commit 3f7b0672086 ("Module section offsets in /sys/module"), where Jonathan Corbet wanted to show people how to debug loadable modules. See https://lwn.net/Articles/88052/ from June 2004. To expose the required load address information, Jonathan added the sections subdirectory for every module in /sys/modules, and made them S_IRUGO - readable by everybody. It was a more innocent time, plus those S_IRxxx macro names are a lot more confusing than the octal numbers are, so maybe it wasn't even intentional. But here we are, thirteen years later, and I'll just change it to S_IRUSR instead. Let's see if anybody even notices. Cc: Jonathan Corbet Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..fdb3a6aca363 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1516,7 +1516,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUGO; + sattr->mattr.attr.mode = S_IRUSR; *(gattr++) = &(sattr++)->mattr.attr; } *gattr = NULL; -- cgit v1.2.3 From 516fb7f2e73dcc303fb97fc3593209fcacf2d982 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Nov 2017 18:44:23 -0800 Subject: /proc/module: use the same logic as /proc/kallsyms for address exposure The (alleged) users of the module addresses are the same: kernel profiling. So just expose the same helper and format macros, and unify the logic. 
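As an illustration of the shared logic, a hedged sketch of the same gating pattern applied to a hypothetical single_open() seq_file (my_show and my_open are invented names; KALLSYM_FMT and the kallsyms_show_value() declaration are assumed to be the helpers this patch shares via <linux/kallsyms.h>):

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>

static int my_show(struct seq_file *m, void *v)
{
	/* print a real kernel address only for privileged readers */
	unsigned long addr = m->private ? 0 : (unsigned long)&my_show;

	seq_printf(m, "addr: 0x" KALLSYM_FMT "\n", addr);
	return 0;
}

static int my_open(struct inode *inode, struct file *file)
{
	int err = single_open(file, my_show, NULL);

	if (!err) {
		struct seq_file *m = file->private_data;

		/* non-NULL private means "hide", mirroring modules_open() */
		m->private = kallsyms_show_value() ? NULL : (void *)8ul;
	}
	return err;
}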
Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 8 +------- kernel/module.c | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 51b49ed452e4..1e6ae66c6244 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -581,12 +581,6 @@ static void s_stop(struct seq_file *m, void *p) { } -#ifndef CONFIG_64BIT -# define KALLSYM_FMT "%08lx" -#else -# define KALLSYM_FMT "%016lx" -#endif - static int s_show(struct seq_file *m, void *p) { unsigned long value; @@ -640,7 +634,7 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -static int kallsyms_show_value(void) +int kallsyms_show_value(void) { switch (kptr_restrict) { case 0: diff --git a/kernel/module.c b/kernel/module.c index fdb3a6aca363..0122747ba150 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4147,6 +4147,7 @@ static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); char buf[MODULE_FLAGS_BUF_SIZE]; + unsigned long value; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4162,7 +4163,8 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->core_layout.base); + value = m->private ? 0 : (unsigned long)mod->core_layout.base; + seq_printf(m, " 0x" KALLSYM_FMT, value); /* Taints info */ if (mod->taints) @@ -4184,9 +4186,23 @@ static const struct seq_operations modules_op = { .show = m_show }; +/* + * This also sets the "private" pointer to non-NULL if the + * kernel pointers should be hidden (so you can just test + * "m->private" to see if you should keep the values private). + * + * We use the same logic as for /proc/kallsyms. + */ static int modules_open(struct inode *inode, struct file *file) { - return seq_open(file, &modules_op); + int err = seq_open(file, &modules_op); + + if (!err) { + struct seq_file *m = file->private_data; + m->private = kallsyms_show_value() ? NULL : (void *)8ul; + } + + return 0; } static const struct file_operations proc_modules_operations = { -- cgit v1.2.3 From 8e7df2b5b7f245c9bd11064712db5cb69044a362 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 13 Nov 2017 07:15:41 +0100 Subject: timer/debug: Change /proc/timer_list from 0444 to 0400 While it uses %pK, there's still few reasons to read this file as non-root. 
Suggested-by: Linus Torvalds Acked-by: Thomas Gleixner Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/timer_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 0e7f5428a148..0ed768b56c60 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -389,7 +389,7 @@ static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("timer_list", 0444, NULL, &timer_list_fops); + pe = proc_create("timer_list", 0400, NULL, &timer_list_fops); if (!pe) return -ENOMEM; return 0; -- cgit v1.2.3 From 5e4def20381678ba3ce0a4e117f97e378ecd81bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2017 15:27:44 +0000 Subject: Pass mode to wait_on_atomic_t() action funcs and provide default actions Make wait_on_atomic_t() pass the TASK_* mode onto its action function as an extra argument and make it 'unsigned int throughout. Also, consolidate a bunch of identical action functions into a default function that can do the appropriate thing for the mode. Also, change the argument name in the bit_wait*() function declarations to reflect the fact that it's the mode and not the bit number. [Peter Z gives this a grudging ACK, but thinks that the whole atomic_t wait should be done differently, though he's not immediately sure as to how] Signed-off-by: David Howells Acked-by: Peter Zijlstra cc: Ingo Molnar --- kernel/sched/wait_bit.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index f8159698aa4d..84cb3acd9260 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo */ static __sched int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - int (*action)(atomic_t *), unsigned mode) + wait_atomic_t_action_f action, unsigned int mode) { atomic_t *val; int ret = 0; @@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en val = wbq_entry->key.flags; if (atomic_read(val) == 0) break; - ret = (*action)(val); + ret = (*action)(val, mode); } while (!ret && atomic_read(val) != 0); finish_wait(wq_head, &wbq_entry->wq_entry); return ret; @@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en }, \ } -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, + wait_atomic_t_action_f action, + unsigned int mode) { struct wait_queue_head *wq_head = atomic_t_waitqueue(p); DEFINE_WAIT_ATOMIC_T(wq_entry, p); @@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); +__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) +{ + schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(atomic_t_wait); + /** * wake_up_atomic_t - Wake up a waiter on a atomic_t * @p: The atomic_t being waited on, a kernel virtual address -- cgit v1.2.3 From 9fd29c08e52023252f0480ab8f6906a1ecc9a8d5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 12 Nov 2017 14:49:09 -0800 Subject: bpf: improve verifier ARG_CONST_SIZE_OR_ZERO semantics For helpers, the argument type 
ARG_CONST_SIZE_OR_ZERO permits the access size to be 0 when accessing the previous argument (arg). Right now, it requires the arg needs to be NULL when size passed is 0 or could be 0. It also requires a non-NULL arg when the size is proved to be non-0. This patch changes verifier ARG_CONST_SIZE_OR_ZERO behavior such that for size-0 or possible size-0, it is not required the arg equal to NULL. There are a couple of reasons for this semantics change, and all of them intends to simplify user bpf programs which may improve user experience and/or increase chances of verifier acceptance. Together with the next patch which changes bpf_probe_read arg2 type from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO, the following two examples, which fail the verifier currently, are able to get verifier acceptance. Example 1: unsigned long len = pend - pstart; len = len > MAX_PAYLOAD_LEN ? MAX_PAYLOAD_LEN : len; len &= MAX_PAYLOAD_LEN; bpf_probe_read(data->payload, len, pstart); It does not have test for "len > 0" and it failed the verifier. Users may not be aware that they have to add this test. Converting the bpf_probe_read helper to have ARG_CONST_SIZE_OR_ZERO helps the above code get verifier acceptance. Example 2: Here is one example where llvm "messed up" the code and the verifier fails. ...... unsigned long len = pend - pstart; if (len > 0 && len <= MAX_PAYLOAD_LEN) bpf_probe_read(data->payload, len, pstart); ...... The compiler generates the following code and verifier fails: ...... 39: (79) r2 = *(u64 *)(r10 -16) 40: (1f) r2 -= r8 41: (bf) r1 = r2 42: (07) r1 += -1 43: (25) if r1 > 0xffe goto pc+3 R0=inv(id=0) R1=inv(id=0,umax_value=4094,var_off=(0x0; 0xfff)) R2=inv(id=0) R6=map_value(id=0,off=0,ks=4,vs=4095,imm=0) R7=inv(id=0) R8=inv(id=0) R9=inv0 R10=fp0 44: (bf) r1 = r6 45: (bf) r3 = r8 46: (85) call bpf_probe_read#45 R2 min value is negative, either use unsigned or 'var &= const' ...... The compiler optimization is correct. If r1 = 0, r1 - 1 = 0xffffffffffffffff > 0xffe. If r1 != 0, r1 - 1 will not wrap. r1 > 0xffe at insn #43 can actually capture both "r1 > 0" and "len <= MAX_PAYLOAD_LEN". This however causes an issue in verifier as the value range of arg2 "r2" does not properly get refined and lead to verification failure. Relaxing bpf_prog_read arg2 from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO allows the following simplied code: unsigned long len = pend - pstart; if (len <= MAX_PAYLOAD_LEN) bpf_probe_read(data->payload, len, pstart); The llvm compiler will generate less complex code and the verifier is able to verify that the program is okay. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4a942e2e753d..dd54d20ace2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -799,12 +799,13 @@ static int check_stack_read(struct bpf_verifier_env *env, /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_map *map = regs[regno].map_ptr; - if (off < 0 || size <= 0 || off + size > map->value_size) { + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + off + size > map->value_size) { verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -814,7 +815,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; @@ -837,7 +838,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size); + err = __check_map_access(env, regno, reg->smin_value + off, size, + zero_size_allowed); if (err) { verbose(env, "R%d min value is outside of the array range\n", regno); @@ -853,7 +855,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size); + err = __check_map_access(env, regno, reg->umax_value + off, size, + zero_size_allowed); if (err) verbose(env, "R%d max value is outside of the array range\n", regno); @@ -889,12 +892,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; - if (off < 0 || size <= 0 || (u64)off + size > reg->range) { + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + (u64)off + size > reg->range) { verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; @@ -903,7 +907,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, } static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; @@ -922,7 +926,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size); + err = __check_packet_access(env, regno, off, size, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; @@ -1097,7 +1101,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_map_access(env, regno, off, size); + err = check_map_access(env, 
regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -1184,7 +1188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size); + err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { @@ -1281,7 +1285,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size <= 0) { + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; @@ -1319,9 +1323,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, reg->off, access_size); + return check_packet_access(env, regno, reg->off, access_size, + zero_size_allowed); case PTR_TO_MAP_VALUE: - return check_map_access(env, regno, reg->off, access_size); + return check_map_access(env, regno, reg->off, access_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -1415,7 +1421,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->key_size); + meta->map_ptr->key_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->key_size, @@ -1431,7 +1438,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->value_size); + meta->map_ptr->value_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->value_size, -- cgit v1.2.3 From 9c019e2bc4b2bd8223c8c0d4b6962478b479834d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 12 Nov 2017 14:49:10 -0800 Subject: bpf: change helper bpf_probe_read arg2 type to ARG_CONST_SIZE_OR_ZERO The helper bpf_probe_read arg2 type is changed from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO to permit size-0 buffer. Together with newer ARG_CONST_SIZE_OR_ZERO semantics which allows non-NULL buffer with size 0, this allows simpler bpf programs with verifier acceptance. The previous commit which changes ARG_CONST_SIZE_OR_ZERO semantics has details on examples. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/trace/bpf_trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 506efe6e8ed9..a5580c670866 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -78,12 +78,16 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - int ret; + int ret = 0; + + if (unlikely(size == 0)) + goto out; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); + out: return ret; } @@ -92,7 +96,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; -- cgit v1.2.3 From 92ee46efeb505ead3ab06d3c5ce695637ed5f152 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 13 Nov 2017 16:48:47 -0500 Subject: jump_label: Invoke jump_label_test() via early_initcall() Fengguang Wu reported that running the rcuperf test during boot can cause the jump_label_test() to hit a WARN_ON(). The issue is that the core jump label code relies on kernel_text_address() to detect when it can no longer update branches that may be contained in __init sections. The kernel_text_address() in turn assumes that if the system_state variable is greter than or equal to SYSTEM_RUNNING then __init sections are no longer valid (since the assumption is that they have been freed). However, when rcuperf is setup to run in early boot it can call kernel_power_off() which sets the system_state to SYSTEM_POWER_OFF. Since rcuperf initialization is invoked via a module_init(), we can make the dependency of jump_label_test() needing to complete before rcuperf explicit by calling it via early_initcall(). Reported-by: Fengguang Wu Signed-off-by: Jason Baron Acked-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1510609727-2238-1-git-send-email-jbaron@akamai.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8ff4ca4665ff..8594d24e4adc 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -769,7 +769,7 @@ static __init int jump_label_test(void) return 0; } -late_initcall(jump_label_test); +early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ #endif /* HAVE_JUMP_LABEL */ -- cgit v1.2.3 From aea3706cfc4d952ed6d32b6d5845b5ecd99ed7f5 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Mon, 13 Nov 2017 14:51:31 -0800 Subject: timekeeping: Remove CONFIG_GENERIC_TIME_VSYSCALL_OLD As of d4d1fc61eb38f (ia64: Update fsyscall gettime to use modern vsyscall_update)the last user of CONFIG_GENERIC_TIME_VSYSCALL_OLD have been updated, the legacy support for old-style vsyscall implementations can be removed from the timekeeping code. (Thanks again to Tony Luck for helping remove the last user!) 
[jstultz: Commit message rework] Signed-off-by: Miroslav Lichvar Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Tony Luck Cc: Richard Cochran Cc: Stephen Boyd Link: https://lkml.kernel.org/r/1510613491-16695-1-git-send-email-john.stultz@linaro.org --- kernel/time/Kconfig | 4 ---- kernel/time/timekeeping.c | 45 --------------------------------------------- 2 files changed, 49 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d689a9557e17..e776fc8cc1df 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -21,10 +21,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE config GENERIC_TIME_VSYSCALL bool -# Timekeeping vsyscall support -config GENERIC_TIME_VSYSCALL_OLD - bool - # Old style timekeeping config ARCH_USES_GETTIMEOFFSET bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 198afa78bf69..cd03317e7b57 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -557,45 +557,6 @@ static void halt_fast_timekeeper(struct timekeeper *tk) update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } -#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD -#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon. - -static inline void update_vsyscall(struct timekeeper *tk) -{ - struct timespec xt, wm; - - xt = timespec64_to_timespec(tk_xtime(tk)); - wm = timespec64_to_timespec(tk->wall_to_monotonic); - update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, - tk->tkr_mono.cycle_last); -} - -static inline void old_vsyscall_fixup(struct timekeeper *tk) -{ - s64 remainder; - - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD - * users are removed, this can be killed. - */ - remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); - if (remainder != 0) { - tk->tkr_mono.xtime_nsec -= remainder; - tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; - } -} -#else -#define old_vsyscall_fixup(tk) -#endif - static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) @@ -2163,12 +2124,6 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(tk, offset); - /* - * XXX This can be killed once everyone converts - * to the new update_vsyscall. 
- */ - old_vsyscall_fixup(tk); - /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC -- cgit v1.2.3 From 4a31b424ac0656d1bb17520ee861144fe7a19664 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 15 Nov 2017 08:47:02 +0300 Subject: perf/core: Fix memory leak triggered by perf --namespace perf with --namespace key leaks various memory objects including namespaces 4.14.0+ pid_namespace 1 12 2568 12 8 user_namespace 1 39 824 39 8 net_namespace 1 5 6272 5 8 This happen because perf_fill_ns_link_info() struct patch ns_path: during initialization ns_path incremented counters on related mnt and dentry, but without lost path_put nobody decremented them back. Leaked dentry is name of related namespace, and its leak does not allow to free unused namespace. Signed-off-by: Vasily Averin Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Hari Bathini Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Thomas Gleixner Fixes: commit e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include namespaces related info") Link: http://lkml.kernel.org/r/c510711b-3904-e5e1-d296-61273d21118d@virtuozzo.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d..ab5ac84f82e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6756,6 +6756,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; + path_put(&ns_path); } } -- cgit v1.2.3 From 89ad2fa3f043a1e8daae193bcb5fe34d5f8caf28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Nov 2017 17:15:50 -0800 Subject: bpf: fix lockdep splat pcpu_freelist_pop() needs the same lockdep awareness than pcpu_freelist_populate() to avoid a false positive. [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] switchto-defaul/12508 [HC0[0]:SC0[6]:HE0:SE0] is trying to acquire: (&htab->buckets[i].lock){......}, at: [] __htab_percpu_map_update_elem+0x1cb/0x300 and this task is already holding: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...}, at: [] __dev_queue_xmit+0 x868/0x1240 which would create a new lock dependency: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} -> (&htab->buckets[i].lock){......} but this new dependency connects a SOFTIRQ-irq-safe lock: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} ... 
which became SOFTIRQ-irq-safe at: [] __lock_acquire+0x42b/0x1f10 [] lock_acquire+0xbc/0x1b0 [] _raw_spin_lock+0x38/0x50 [] __dev_queue_xmit+0x868/0x1240 [] dev_queue_xmit+0x10/0x20 [] ip_finish_output2+0x439/0x590 [] ip_finish_output+0x150/0x2f0 [] ip_output+0x7d/0x260 [] ip_local_out+0x5e/0xe0 [] ip_queue_xmit+0x205/0x620 [] tcp_transmit_skb+0x5a8/0xcb0 [] tcp_write_xmit+0x242/0x1070 [] __tcp_push_pending_frames+0x3c/0xf0 [] tcp_rcv_established+0x312/0x700 [] tcp_v4_do_rcv+0x11c/0x200 [] tcp_v4_rcv+0xaa2/0xc30 [] ip_local_deliver_finish+0xa7/0x240 [] ip_local_deliver+0x66/0x200 [] ip_rcv_finish+0xdd/0x560 [] ip_rcv+0x295/0x510 [] __netif_receive_skb_core+0x988/0x1020 [] __netif_receive_skb+0x21/0x70 [] process_backlog+0x6f/0x230 [] net_rx_action+0x229/0x420 [] __do_softirq+0xd8/0x43d [] do_softirq_own_stack+0x1c/0x30 [] do_softirq+0x55/0x60 [] __local_bh_enable_ip+0xa8/0xb0 [] cpu_startup_entry+0x1c7/0x500 [] start_secondary+0x113/0x140 to a SOFTIRQ-irq-unsafe lock: (&head->lock){+.+...} ... which became SOFTIRQ-irq-unsafe at: ... [] __lock_acquire+0x82f/0x1f10 [] lock_acquire+0xbc/0x1b0 [] _raw_spin_lock+0x38/0x50 [] pcpu_freelist_pop+0x7a/0xb0 [] htab_map_alloc+0x50c/0x5f0 [] SyS_bpf+0x265/0x1200 [] entry_SYSCALL_64_fastpath+0x12/0x17 other info that might help us debug this: Chain exists of: dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2 --> &htab->buckets[i].lock --> &head->lock Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&head->lock); local_irq_disable(); lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2); lock(&htab->buckets[i].lock); lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2); *** DEADLOCK *** Fixes: e19494edab82 ("bpf: introduce percpu_freelist") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- kernel/bpf/percpu_freelist.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 5c51d1985b51..673fa6fe2d73 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; + unsigned long flags; int orig_cpu, cpu; + local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock(&head->lock); + raw_spin_unlock_irqrestore(&head->lock, flags); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) + if (cpu == orig_cpu) { + local_irq_restore(flags); return NULL; + } } } -- cgit v1.2.3 From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:33 -0800 Subject: mm: account pud page tables On a machine with 5-level paging support a process can allocate significant amount of memory and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PUD page tables. We don't account PUD page tables, only PMD and PTE. We already addressed the same issue for PMD page tables, see commit dc6c9a35b66b ("mm: account pmd page tables to the process"). Introduction of 5-level paging brings the same issue for PUD page tables. The patch expands accounting to PUD level. 
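A userspace sketch of the scenario described above, hedged: it assumes x86-64 with 5-level paging (la57) enabled and that mmap() honors the best-effort hint addresses. Touching one page per 512GB-aligned region makes the kernel allocate a separate PUD table for each mapping, none of which were accounted before this patch.

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

#define PUD_TABLE_SPAN	(512UL << 30)	/* address range covered by one PUD table */

int main(void)
{
	unsigned long n = 0;
	uintptr_t hint;

	for (hint = PUD_TABLE_SPAN; hint < (1UL << 56); hint += PUD_TABLE_SPAN) {
		/* the hint is best-effort; a stricter test would verify p == hint */
		char *p = mmap((void *)hint, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			break;
		*p = 1;	/* fault it in: allocates PTE, PMD and PUD tables */
		n++;
	}
	printf("touched %lu sparse regions\n", n);
	pause();	/* keep the page tables pinned while inspecting the process */
	return 0;
}

Each iteration costs only one data page from the process's point of view, while the kernel quietly accumulates roughly 4KB of PUD table per region.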
[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/] Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com [heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting] Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..a4eb6f289365 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); mm_nr_pmds_init(mm); + mm_nr_puds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm) if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); + if (mm_nr_puds(mm)) + pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", + mm_nr_puds(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -- cgit v1.2.3 From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd and nr_pud. The patch also makes nr_ptes accounting dependent onto CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. It's preparation for consolidation of page-table counters in mm_struct. Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a4eb6f289365..946922a30ede 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); + mm_nr_ptes_init(mm); mm_nr_pmds_init(mm); mm_nr_puds_init(mm); mm->map_count = 0; @@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) + if (mm_nr_ptes(mm)) pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); + mm_nr_ptes(mm)); if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); -- cgit v1.2.3 From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of total memory allocated to page tables for oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to single counter. mm->pgtables_bytes is now used to account all page table levels. 
We use bytes, because page table size for different levels of page table tree may be different. The change has user-visible effect: we don't have VmPMD and VmPUD reported in /proc/[pid]/status. Not sure if anybody uses them. (As alternative, we can always report 0 kB for them.) OOM-killer report is also slightly changed: we now report pgtables_bytes instead of nr_ptes, nr_pmd, nr_puds. Apart from reducing number of counters per-mm, the benefit is that we now calculate oom_badness() more correctly for machines which have different size of page tables depending on level or where page tables are less than a page in size. The only downside can be debuggability because we do not know which page table level could leak. But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 946922a30ede..006dc5899a1a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - mm_nr_ptes_init(mm); - mm_nr_pmds_init(mm); - mm_nr_puds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (mm_nr_ptes(mm)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - mm_nr_ptes(mm)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); - if (mm_nr_puds(mm)) - pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", - mm_nr_puds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -- cgit v1.2.3 From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:51 -0800 Subject: kmemcheck: remove annotations Patch series "kmemcheck: kill kmemcheck", v2. As discussed at LSF/MM, kill kmemcheck. KASan is a replacement that is able to work without the limitation of kmemcheck (single CPU, slow). KASan is already upstream. We are also not aware of any users of kmemcheck (or users who don't consider KASan as a suitable replacement). The only objection was that since KASAN wasn't supported by all GCC versions provided by distros at that time we should hold off for 2 years, and try again. Now that 2 years have passed, and all distros provide gcc that supports KASAN, kill kmemcheck again for the very same reasons. This patch (of 4): Remove kmemcheck annotations, and calls to kmemcheck from the kernel. 
[alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs] Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bpf/core.c | 6 ------ kernel/locking/lockdep.c | 3 --- kernel/trace/ring_buffer.c | 3 --- 3 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7b62df86be1d..11ad089f2c74 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - kmemcheck_annotate_bitfield(fp, meta); - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { - kmemcheck_annotate_bitfield(fp, meta); - memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; @@ -662,8 +658,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); if (fp != NULL) { - kmemcheck_annotate_bitfield(fp, meta); - /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index db933d063bfc..9776da8db180 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, { int i; - kmemcheck_mark_initialized(lock, sizeof(*lock)); - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) lock->class_cache[i] = NULL; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 845f3805c73d..d57fede84b38 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -13,7 +13,6 @@ #include #include #include /* for self test */ -#include #include #include #include @@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); /* account for padding bytes */ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); @@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); -- cgit v1.2.3 From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:54 -0800 Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK Convert all allocations that used a NOTRACK flag to stop using it. Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. 
Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++++++------ kernel/signal.c | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 006dc5899a1a..4e55eedba8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -469,7 +469,7 @@ void __init fork_init(void) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -2205,18 +2205,18 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); + SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the @@ -2227,7 +2227,7 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..aa1fb9f905db 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); + q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { -- cgit v1.2.3 From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:36:02 -0800 Subject: kmemcheck: rip it out Fix up makefiles, remove references, and git rm kmemcheck. Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Steven Rostedt Cc: Vegard Nossum Cc: Pekka Enberg Cc: Michal Hocko Cc: Eric W. 
Biederman Cc: Alexander Potapenko Cc: Tim Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 10 ---------- kernel/sysctl.c | 10 ---------- 2 files changed, 20 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 662f7b1b7a78..2f5e87f1bae2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); -void __tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - lockdep_assert_irqs_disabled(); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); -} -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9576bd582d4a..7638e2f7fff8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1173,15 +1172,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_thousand, }, -#endif -#ifdef CONFIG_KMEMCHECK - { - .procname = "kmemcheck", - .data = &kmemcheck_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { .procname = "panic_on_warn", -- cgit v1.2.3 From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:03 -0800 Subject: mm: remove __GFP_COLD As the page free path makes no distinction between cache hot and cold pages, there is no real useful ordering of pages in the free list that allocation requests can take advantage of. Juding from the users of __GFP_COLD, it is likely that a number of them are the result of copying other sites instead of actually measuring the impact. Remove the __GFP_COLD parameter which simplifies a number of paths in the page allocator. This is potentially controversial but bear in mind that the size of the per-cpu pagelists versus modern cache sizes means that the whole per-cpu list can often fit in the L3 cache. Hence, there is only a potential benefit for microbenchmarks that alloc/free pages in a tight loop. It's even worse when THP is taken into account which has little or no chance of getting a cache-hot page as the per-cpu list is bypassed and the zeroing of multiple pages will thrash the cache anyway. The truncate microbenchmarks are not shown as this patch affects the allocation path and not the free path. A page fault microbenchmark was tested but it showed no sigificant difference which is not surprising given that the __GFP_COLD branches are a miniscule percentage of the fault path. 
Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a917a301e201..bce0464524d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) */ static inline int get_highmem_buffer(int safe_needed) { - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + buffer = get_image_page(GFP_ATOMIC, safe_needed); return buffer ? 0 : -ENOMEM; } @@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm, while (nr_pages-- > 0) { struct page *page; - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_image_page(GFP_ATOMIC); if (!page) goto err_out; memory_bm_set_bit(copy_bm, page_to_pfn(page)); -- cgit v1.2.3 From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Wed, 15 Nov 2017 17:38:22 -0800 Subject: mm, sysctl: make NUMA stats configurable This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. ========================================================================= When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer (increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench ========================================================================= When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. [keescook@chromium.org: make sure mutex is a global static] Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Signed-off-by: Kees Cook Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Luis R . 
Rodriguez" Cc: Kees Cook Cc: Jonathan Corbet Cc: Mel Gorman Cc: Johannes Weiner Cc: Christopher Lameter Cc: Sebastian Andrzej Siewior Cc: Andrey Ryabinin Cc: Tim Chen Cc: Andi Kleen Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7638e2f7fff8..4a13a389e99b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1356,6 +1356,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "hugetlb_shm_group", -- cgit v1.2.3 From b1fca27d384e8418aac84b39f6f5179aecc1b64f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:03 -0800 Subject: kernel debug: support resetting WARN*_ONCE I like _ONCE warnings because it's guaranteed that they don't flood the log. During testing I find it useful to reset the state of the once warnings, so that I can rerun tests and see if they trigger again, or can guarantee that a test run always hits the same warnings. This patch adds a debugfs interface to reset all the _ONCE warnings so that they appear again: echo 1 > /sys/kernel/debug/clear_warn_once This is implemented by putting all the warning booleans into a special section, and clearing it. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20171017221455.6740-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index bdd18afa19a4..672a91dc20fe 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -587,6 +589,32 @@ void warn_slowpath_null(const char *file, int line) EXPORT_SYMBOL(warn_slowpath_null); #endif +#ifdef CONFIG_BUG + +/* Support resetting WARN*_ONCE state */ + +static int clear_warn_once_set(void *data, u64 val) +{ + memset(__start_once, 0, __end_once - __start_once); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, + NULL, + clear_warn_once_set, + "%lld\n"); + +static __init int register_warn_debugfs(void) +{ + /* Don't care about failure */ + debugfs_create_file("clear_warn_once", 0644, NULL, + NULL, &clear_warn_once_fops); + return 0; +} + +device_initcall(register_warn_debugfs); +#endif + #ifdef CONFIG_CC_STACKPROTECTOR /* -- cgit v1.2.3 From aaf5dcfb223617ac2d16113e4b500199c65689de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:06 -0800 Subject: kernel debug: support resetting WARN_ONCE for all architectures Some architectures store the WARN_ONCE state in the flags field of the bug_entry. Clear that one too when resetting once state through /sys/kernel/debug/clear_warn_once Pointed out by Michael Ellerman Improves the earlier patch that add clear_warn_once. 
[ak@linux.intel.com: add a missing ifdef CONFIG_MODULES] Link: http://lkml.kernel.org/r/20171020170633.9593-1-andi@firstfloor.org [akpm@linux-foundation.org: fix unused var warning] [akpm@linux-foundation.org: Use 0200 for clear_warn_once file, per mpe] [akpm@linux-foundation.org: clear BUGFLAG_DONE in clear_once_table(), per mpe] Link: http://lkml.kernel.org/r/20171019204642.7404-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 672a91dc20fe..67cebf2a3b67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -595,6 +595,7 @@ EXPORT_SYMBOL(warn_slowpath_null); static int clear_warn_once_set(void *data, u64 val) { + generic_bug_clear_once(); memset(__start_once, 0, __end_once - __start_once); return 0; } @@ -607,7 +608,7 @@ DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0644, NULL, + debugfs_create_file("clear_warn_once", 0200, NULL, NULL, &clear_warn_once_fops); return 0; } -- cgit v1.2.3 From 2a8358d8a339540f00ec596526690e8eeca931a3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:21 -0800 Subject: bug: define the "cut here" string in a single place The "cut here" string is used in a few paths. Define it in a single place. Link: http://lkml.kernel.org/r/1510100869-73751-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Fengguang Wu Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 67cebf2a3b67..89df5fa2f798 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn("------------[ cut here ]------------\n"); + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", -- cgit v1.2.3 From a7bed27af194aa3f67915688039d93188ed95e2a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:24 -0800 Subject: bug: fix "cut here" location for __WARN_TAINT architectures Prior to v4.11, x86 used warn_slowpath_fmt() for handling WARN()s. After WARN() was moved to using UD0 on x86, the warning text started appearing _before_ the "cut here" line. This appears to have been a long-standing bug on architectures that used __WARN_TAINT, but it didn't get fixed. 
v4.11 and earlier on x86: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 2956 at drivers/misc/lkdtm_bugs.c:65 lkdtm_WARNING+0x21/0x30 This is a warning message Modules linked in: v4.12 and later on x86: This is a warning message ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2982 at drivers/misc/lkdtm_bugs.c:68 lkdtm_WARNING+0x15/0x20 Modules linked in: With this fix: ------------[ cut here ]------------ This is a warning message WARNING: CPU: 3 PID: 3009 at drivers/misc/lkdtm_bugs.c:67 lkdtm_WARNING+0x15/0x20 Since the __FILE__ reporting happens as part of the UD0 handler, it isn't trivial to move the message to after the WARNING line, but at least we can fix the position of the "cut here" line so all the various logging tools will start including the actual runtime warning message again, when they follow the instruction and "cut here". Link: http://lkml.kernel.org/r/1510100869-73751-4-git-send-email-keescook@chromium.org Fixes: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0") Signed-off-by: Kees Cook Cc: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Fengguang Wu Cc: Arnd Bergmann Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 89df5fa2f798..3242b64b1956 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn(CUT_HERE); + if (args) + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", @@ -584,9 +585,22 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { + pr_warn(CUT_HERE); __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); +#else +void __warn_printk(const char *fmt, ...) +{ + va_list args; + + pr_warn(CUT_HERE); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); +} +EXPORT_SYMBOL(__warn_printk); #endif #ifdef CONFIG_BUG -- cgit v1.2.3 From 8c703d660450c4df72ff24f79a335dc7973a9dc8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:27:32 -0800 Subject: kernel/umh.c: optimize 'proc_cap_handler()' If 'write' is 0, we can avoid a call to spin_lock/spin_unlock. Link: http://lkml.kernel.org/r/20171020193331.7233-1-christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Luis R. 
Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/umh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 6ff9905250ff..18e5fa4b0e71 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write, /* * Drop everything not in the new_cap (but don't add things) */ - spin_lock(&umh_sysctl_lock); if (write) { + spin_lock(&umh_sysctl_lock); if (table->data == CAP_BSET) usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); if (table->data == CAP_PI) usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + spin_unlock(&umh_sysctl_lock); } - spin_unlock(&umh_sysctl_lock); return 0; } -- cgit v1.2.3 From 98159d977f71c3b3dee898d1c34e56f520b094e7 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:17 -0800 Subject: pipe: match pipe_max_size data type with procfs Patch series "A few round_pipe_size() and pipe-max-size fixups", v3. While backporting Michael's "pipe: fix limit handling" patchset to a distro-kernel, Mikulas noticed that current upstream pipe limit handling contains a few problems: 1 - procfs signed wrap: echo'ing a large number into /proc/sys/fs/pipe-max-size and then cat'ing it back out shows a negative value. 2 - round_pipe_size() nr_pages overflow on 32bit: this would subsequently try roundup_pow_of_two(0), which is undefined. 3 - visible non-rounded pipe-max-size value: there is no mutual exclusion or protection between the time pipe_max_size is assigned a raw value from proc_dointvec_minmax() and when it is rounded. 4 - unsigned long -> unsigned int conversion makes for potential odd return errors from do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv(). This version underwent the same testing as v1: https://marc.info/?l=linux-kernel&m=150643571406022&w=2 This patch (of 4): pipe_max_size is defined as an unsigned int: unsigned int pipe_max_size = 1048576; but its procfs/sysctl representation is an integer: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... that is signed: int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) This leads to signed results via procfs for large values of pipe_max_size: % echo 2147483647 >/proc/sys/fs/pipe-max-size % cat /proc/sys/fs/pipe-max-size -2147483648 Use unsigned operations on this variable to avoid such negative values. 
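A stand-alone illustration of the wrap described above (not kernel code; the variable names are invented for the example, and the negative readback assumes the usual two's-complement ABI). Because pipe_max_size is rounded up to the next power of two, writing INT_MAX stores 2^31, which the signed proc path then reports back as a negative number:

#include <stdio.h>

int main(void)
{
        unsigned int requested = 2147483647u;   /* what was echoed in (INT_MAX) */
        unsigned int stored = 1u << 31;         /* after rounding up to a power of two */

        printf("requested: %u\n", requested);   /* 2147483647 */
        printf("stored:    %u\n", stored);      /* 2147483648 */
        printf("reported:  %d\n", (int)stored); /* -2147483648 */
        return 0;
}
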
Link: http://lkml.kernel.org/r/1507658689-11669-2-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Al Viro Cc: Jens Axboe Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4a13a389e99b..2d42183b4c98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1816,7 +1816,7 @@ static struct ctl_table fs_table[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, - .maxlen = sizeof(int), + .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, -- cgit v1.2.3 From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:24 -0800 Subject: pipe: add proc_dopipe_max_size() to safely assign pipe_max_size pipe_max_size is assigned directly via procfs sysctl: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) ... and then later rounded in-place a few statements later: ... pipe_max_size = round_pipe_size(pipe_max_size); ... This leaves a window of time between initial assignment and rounding that may be visible to other threads. (For example, one thread sets a non-rounded value to pipe_max_size while another reads its value.) Similar reads of pipe_max_size are potentially racy: pipe.c :: alloc_pipe_info() pipe.c :: pipe_set_size() Add a new proc_dopipe_max_size() that consolidates reading the new value from the user buffer, verifying bounds, and calling round_pipe_size() with a single assignment to pipe_max_size. 
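A rough user-space sketch of the update pattern the new handler adopts (illustrative only; round_up_pow2(), set_pipe_max_size() and the constants are inventions of this example, not kernel interfaces): the value is parsed, rounded and range-checked in locals first, and only then published with a single store, so concurrent readers never observe an unrounded intermediate value:

#include <limits.h>
#include <stdio.h>

static unsigned int pipe_max_size = 1048576;    /* the published value */

static unsigned long round_up_pow2(unsigned long v)
{
        unsigned long p = 1;

        if (v == 0)
                return 0;
        while (p < v) {
                if (p > ULONG_MAX / 2)
                        return 0;               /* would overflow */
                p <<= 1;
        }
        return p;
}

static int set_pipe_max_size(unsigned long user_val, unsigned int min)
{
        unsigned long rounded;

        if (user_val > UINT_MAX)
                return -1;                      /* -EINVAL in the kernel handler */

        rounded = round_up_pow2(user_val);
        if (rounded == 0 || rounded > UINT_MAX)
                return -1;                      /* -EINVAL */
        if (rounded < min)
                return -1;                      /* -ERANGE */

        pipe_max_size = rounded;                /* single, already-rounded store */
        return 0;
}

int main(void)
{
        set_pipe_max_size(70000, 4096);
        printf("pipe_max_size = %u\n", pipe_max_size);  /* 131072 */
        return 0;
}
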
Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d42183b4c98..138b6484f277 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, ¶m); } +struct do_proc_dopipe_max_size_conv_param { + unsigned int *min; +}; + +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_dopipe_max_size_conv_param *param = data; + + if (write) { + unsigned int val = round_pipe_size(*lvalp); + + if (val == 0) + return -EINVAL; + + if (param->min && *param->min > val) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dopipe_max_size_conv_param param = { + .min = (unsigned int *) table->extra1, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit v1.2.3 From fb910c42ccebf853c29296185c45c11164a56098 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:28 -0800 Subject: sysctl: check for UINT_MAX before unsigned int min/max Mikulas noticed in the existing do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv() introduced in this patchset, that they inconsistently handle overflow and min/max range inputs: For example: 0 ... param->min - 1 ---> ERANGE param->min ... param->max ---> the value is accepted param->max + 1 ... 0x100000000L + param->min - 1 ---> ERANGE 0x100000000L + param->min ... 0x100000000L + param->max ---> EINVAL 0x100000000L + param->max + 1, 0x200000000L + param->min - 1 ---> ERANGE 0x200000000L + param->min ... 0x200000000L + param->max ---> EINVAL 0x200000000L + param->max + 1, 0x300000000L + param->min - 1 ---> ERANGE In do_proc_do*() routines which store values into unsigned int variables (4 bytes wide for 64-bit builds), first validate that the input unsigned long value (8 bytes wide for 64-bit builds) will fit inside the smaller unsigned int variable. 
Then check that the unsigned int value falls inside the specified parameter min, max range. Otherwise the unsigned long -> unsigned int conversion drops leading bits from the input value, leading to the inconsistent pattern Mikulas documented above. Link: http://lkml.kernel.org/r/1507658689-11669-5-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 138b6484f277..dd25d90896fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2576,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, if (write) { unsigned int val = *lvalp; + if (*lvalp > UINT_MAX) + return -EINVAL; + if ((param->min && *param->min > val) || (param->max && *param->max < val)) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; *valp = val; } else { unsigned int val = *valp; @@ -2632,17 +2633,18 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, struct do_proc_dopipe_max_size_conv_param *param = data; if (write) { - unsigned int val = round_pipe_size(*lvalp); + unsigned int val; + if (*lvalp > UINT_MAX) + return -EINVAL; + + val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; if (param->min && *param->min > val) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; - *valp = val; } else { unsigned int val = *valp; -- cgit v1.2.3 From 628c1bcba204052d19b686b5bac149a644cdb72e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:01 -0800 Subject: kernel/signal.c: protect the traced SIGNAL_UNKILLABLE tasks from SIGKILL The comment in sig_ignored() says "Tracers may want to know about even ignored signals" but SIGKILL can not be reported to debugger and it is just wrong to return 0 in this case: SIGKILL should only kill the SIGNAL_UNKILLABLE task if it comes from the parent ns. Change sig_ignored() to ignore ->ptrace if sig == SIGKILL and rely on sig_task_ignored(). SISGTOP coming from within the namespace is not really right too but at least debugger can intercept it, and we can't drop it here because this will break "gdb -p 1": ptrace_attach() won't work. Perhaps we will add another ->ptrace check later, we will see. Link: http://lkml.kernel.org/r/20171103184206.GB21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index aa1fb9f905db..be5913134742 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, force)) - return 0; - /* - * Tracers may want to know about even ignored signals. + * Tracers may want to know about even ignored signal unless it + * is SIGKILL which can't be reported anyway but can be ignored + * by SIGNAL_UNKILLABLE task. 
*/ - return !t->ptrace; + if (t->ptrace && sig != SIGKILL) + return 0; + + return sig_task_ignored(t, sig, force); } /* -- cgit v1.2.3 From ac25385089f673560867eb5179228a44ade0cfc1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:04 -0800 Subject: kernel/signal.c: protect the SIGNAL_UNKILLABLE tasks from !sig_kernel_only() signals Change sig_task_ignored() to drop the SIG_DFL && !sig_kernel_only() signals even if force == T. This simplifies the next change and this matches the same check in get_signal() which will drop these signals anyway. Link: http://lkml.kernel.org/r/20171103184227.GC21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index be5913134742..01ba166a5e3a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !force) + handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; return sig_handler_ignored(handler, sig); -- cgit v1.2.3 From 426915796ccaf9c2bd9bb06dc5702225957bc2e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:08 -0800 Subject: kernel/signal.c: remove the no longer needed SIGNAL_UNKILLABLE check in complete_signal() complete_signal() checks SIGNAL_UNKILLABLE before it starts to destroy the thread group, today this is wrong in many ways. If nothing else, fatal_signal_pending() should always imply that the whole thread group (except ->group_exit_task if it is not NULL) is killed, this check breaks the rule. After the previous changes we can rely on sig_task_ignored(); sig_fatal(sig) && SIGNAL_UNKILLABLE can only be true if we actually want to kill this task and sig == SIGKILL OR it is traced and debugger can intercept the signal. This should hopefully fix the problem reported by Dmitry. This test-case static int init(void *arg) { for (;;) pause(); } int main(void) { char stack[16 * 1024]; for (;;) { int pid = clone(init, stack + sizeof(stack)/2, CLONE_NEWPID | SIGCHLD, NULL); assert(pid > 0); assert(ptrace(PTRACE_ATTACH, pid, 0, 0) == 0); assert(waitpid(-1, NULL, WSTOPPED) == pid); assert(ptrace(PTRACE_DETACH, pid, 0, SIGSTOP) == 0); assert(syscall(__NR_tkill, pid, SIGKILL) == 0); assert(pid == wait(NULL)); } } triggers the WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)) in task_participate_group_stop(). do_signal_stop()->signal_group_exit() checks SIGNAL_GROUP_EXIT and return false, but task_set_jobctl_pending() checks fatal_signal_pending() and does not set JOBCTL_STOP_PENDING. And his should fix the minor security problem reported by Kyle, SECCOMP_RET_TRACE can miss fatal_signal_pending() the same way if the task is the root of a pid namespace. 
Link: http://lkml.kernel.org/r/20171103184246.GD21036@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Reported-by: Kyle Huey Reviewed-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 01ba166a5e3a..6895f6bb98a7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -931,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && + !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !t->ptrace)) { + (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ -- cgit v1.2.3 From de40ccefd1f19180c0a43e4d9b9d2f4dc8856c8b Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 17 Nov 2017 15:30:12 -0800 Subject: kdump: print a message in case parse_crashkernel_mem resulted in zero bytes parse_crashkernel_mem() silently returns if we get zero bytes in the parsing function. It is useful for debugging to add a message, especially if the kernel cannot boot correctly. Add a pr_info instead of pr_warn because it is expected behavior for size = 0, eg. crashkernel=2G-4G:128M, size will be 0 in case system memory is less than 2G. Link: http://lkml.kernel.org/r/20171114080129.GA6115@dhcp-128-65.nay.redhat.com Signed-off-by: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Bhupesh Sharma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 6db80fc0810b..b3663896278e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } } - } + } else + pr_info("crashkernel size resulted in zero bytes\n"); return 0; } -- cgit v1.2.3 From f9eb2fdd04d4e68fbea18970bbf65ace716d25b6 Mon Sep 17 00:00:00 2001 From: "Ola N. Kaldestad" Date: Fri, 17 Nov 2017 15:30:26 -0800 Subject: kernel/sysctl.c: code cleanups Remove unnecessary else block, remove redundant return and call to kfree in if block. Link: http://lkml.kernel.org/r/1510238435-1655-1-git-send-email-mail@okal.no Signed-off-by: Ola N. Kaldestad Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dd25d90896fc..557d46728577 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3127,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, else bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } - kfree(tmp_bitmap); *lenp -= left; *ppos += *lenp; - return 0; - } else { - kfree(tmp_bitmap); - return err; } + + kfree(tmp_bitmap); + return err; } #else /* CONFIG_PROC_SYSCTL */ -- cgit v1.2.3 From 95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:30 -0800 Subject: pid: replace pid bitmap implementation with IDR API Patch series "Replacing PID bitmap implementation with IDR API", v4. This series replaces kernel bitmap implementation of PID allocation with IDR API. 
These patches are written to simplify the kernel by replacing custom code with calls to generic code. The following are the stats for pid and pid_namespace object files before and after the replacement. There is a noteworthy change between the IDR and bitmap implementation. Before text data bss dec hex filename 8447 3894 64 12405 3075 kernel/pid.o After text data bss dec hex filename 3397 304 0 3701 e75 kernel/pid.o Before text data bss dec hex filename 5692 1842 192 7726 1e2e kernel/pid_namespace.o After text data bss dec hex filename 2854 216 16 3086 c0e kernel/pid_namespace.o The following are the stats for ps, pstree and calling readdir on /proc for 10,000 processes. ps: With IDR API With bitmap real 0m1.479s 0m2.319s user 0m0.070s 0m0.060s sys 0m0.289s 0m0.516s pstree: With IDR API With bitmap real 0m1.024s 0m1.794s user 0m0.348s 0m0.612s sys 0m0.184s 0m0.264s proc: With IDR API With bitmap real 0m0.059s 0m0.074s user 0m0.000s 0m0.004s sys 0m0.016s 0m0.016s This patch (of 2): Replace the current bitmap implementation for Process ID allocation. Functions that are no longer required, for example, free_pidmap(), alloc_pidmap(), etc. are removed. The rest of the functions are modified to use the IDR API. The change was made to make the PID allocation less complex by replacing custom code with calls to generic API. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com [avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl] Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Acked-by: Oleg Nesterov Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 201 +++++++++---------------------------------------- kernel/pid_namespace.c | 53 ++++++------- 2 files changed, 59 insertions(+), 195 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 020dedbdf066..0ce59369632f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -39,6 +39,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) /* * PID-map pages start out as NULL, they get allocated upon @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .pidmap = { - [ 0 ... 
PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, + .idr = IDR_INIT, .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -/* - * If we started walking pids at 'base', is 'a' seen before 'b'? - */ -static int pid_before(int base, int a, int b) -{ - /* - * This is the same as saying - * - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT - * and that mapping orders 'a' and 'b' with respect to 'base'. - */ - return (unsigned)(a - base) < (unsigned)(b - base); -} - -/* - * We might be racing with someone else trying to set pid_ns->last_pid - * at the pid allocation time (there's also a sysctl for this, but racing - * with this one is OK, see comment in kernel/pid_namespace.c about it). - * We want the winner to have the "later" value, because if the - * "earlier" value prevails, then a pid may get reused immediately. - * - * Since pids rollover, it is not sufficient to just pick the bigger - * value. We have to consider where we started counting from. - * - * 'base' is the value of pid_ns->last_pid that we observed when - * we started looking for a pid. - * - * 'pid' is the pid that we eventually found. - */ -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) -{ - int prev; - int last_write = base; - do { - prev = last_write; - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); - } while ((prev != last_write) && (pid_before(base, last_write, pid))); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - /* - * If last_pid points into the middle of the map->page we - * want to scan this bitmap block twice, the second time - * we start with offset == 0 (or RESERVED_PIDS). 
- */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (!map->page) { - map->page = page; - page = NULL; - } - spin_unlock_irq(&pidmap_lock); - kfree(page); - if (unlikely(!map->page)) - return -ENOMEM; - } - if (likely(atomic_read(&map->nr_free))) { - for ( ; ; ) { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - set_last_pid(pid_ns, last, pid); - return pid; - } - offset = find_next_offset(map, offset); - if (offset >= BITS_PER_PAGE) - break; - pid = mk_pid(pid_ns, map, offset); - if (pid >= pid_max) - break; - } - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -EAGAIN; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -266,7 +124,7 @@ void free_pid(struct pid *pid) struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - switch(--ns->nr_hashed) { + switch (--ns->nr_hashed) { case 2: case 1: /* When all that is left in the pid namespace @@ -284,12 +142,11 @@ void free_pid(struct pid *pid) schedule_work(&ns->proc_work); break; } + + idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - call_rcu(&pid->rcu, delayed_put_pid); } @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; pid->level = ns->level; + for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + int pid_min = 1; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&pidmap_lock); + + /* + * init really needs pid 1, but after reaching the maximum + * wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + spin_unlock_irq(&pidmap_lock); + idr_preload_end(); + if (nr < 0) { retval = nr; goto out_free; @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + /* Make the PID visible to find_pid_ns. 
*/ + idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->nr_hashed++; } spin_unlock_irq(&pidmap_lock); @@ -350,8 +230,11 @@ out_unlock: put_pid_ns(ns); out_free: + spin_lock_irq(&pidmap_lock); while (++i <= ns->level) - free_pidmap(pid->numbers + i); + idr_remove(&ns->idr, (pid->numbers + i)->nr); + + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns); */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; + return idr_get_next(&ns->idr, &nr); } /* @@ -578,7 +452,7 @@ void __init pidhash_init(void) 0, 4096); } -void __init pidmap_init(void) +void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); @@ -590,10 +464,7 @@ void __init pidmap_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); + idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4918314893bc..ca7c8a8823b1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; - int i; int err; err = -EINVAL; @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns == NULL) goto out_dec; - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; + idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level + 1); if (ns->pid_cachep == NULL) - goto out_free_map; + goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) - goto out_free_map; + goto out_free_idr; ns->ns.ops = &pidns_operations; kref_init(&ns->kref); @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - return ns; -out_free_map: - kfree(ns->pidmap[0].page); -out_free: +out_free_idr: + idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { - int i; - ns_free_inum(&ns->ns); - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); + + idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 
1 : 2; + struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * maintain a tasklist for each pid namespace. * */ + rcu_read_lock(); read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - rcu_read_lock(); - - task = pid_task(find_vpid(nr), PIDTYPE_PID); + nr = 2; + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { + task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) send_sig_info(SIGKILL, SEND_SIG_FORCED, task); - - rcu_read_unlock(); - - nr = next_pidmap(pid_ns, nr); } read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. @@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; + int ret, next; if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &pid_ns->last_pid; - return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + next = idr_get_cursor(&pid_ns->idr) - 1; + + tmp.data = &next; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (!ret && write) + idr_set_cursor(&pid_ns->idr, next + 1); + + return ret; } extern int pid_max; -- cgit v1.2.3 From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:34 -0800 Subject: pid: remove pidhash pidhash is no longer required as all the information can be looked up from idr tree. nr_hashed represented the number of pids that had been hashed. Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant, it has been renamed to pid_allocated and PIDNS_ADDING respectively. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Tested-by: Tony Luck [ia64] Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Eric W. 
Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/pid.c | 48 ++++++++++-------------------------------------- kernel/pid_namespace.c | 6 +++--- 3 files changed, 14 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4e55eedba8d6..432eadf6b58c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } diff --git a/kernel/pid.c b/kernel/pid.c index 0ce59369632f..b13b624e2c49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,10 +41,6 @@ #include #include -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; - /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max @@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT; struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), .idr = IDR_INIT, - .nr_hashed = PIDNS_HASH_ADDING, + .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -123,8 +118,7 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; - hlist_del_rcu(&upid->pid_chain); - switch (--ns->nr_hashed) { + switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace @@ -133,10 +127,10 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; - case PIDNS_HASH_ADDING: + case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); - ns->nr_hashed = 0; + ns->pid_allocated = 0; /* fall through */ case 0: schedule_work(&ns->proc_work); @@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) + if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); /* Make the PID visible to find_pid_ns. 
*/ idr_replace(&upid->ns->idr, pid, upid->nr); - upid->ns->nr_hashed++; + upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); @@ -243,21 +235,13 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); - ns->nr_hashed &= ~PIDNS_HASH_ADDING; + ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; + return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); @@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (type != PIDTYPE_PID) { if (type == __PIDTYPE_TGID) type = PIDTYPE_PID; + task = task->group_leader; } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); @@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. - */ -void __init pidhash_init(void) -{ - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL | HASH_ZERO, - &pidhash_shift, NULL, - 0, 4096); -} - void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ - BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca7c8a8823b1..0b53eef7d34b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - ns->nr_hashed = PIDNS_HASH_ADDING; + ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * sys_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), * pid_ns can not go away until proc_kill_sb() drops the reference. * * But this ns can also have other tasks injected by setns()+fork(). @@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (pid_ns->nr_hashed == init_pids) + if (pid_ns->pid_allocated == init_pids) break; schedule(); } -- cgit v1.2.3 From 4efb442cc12eb66535b7c7ed06005fd7889c1d77 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 17 Nov 2017 15:30:38 -0800 Subject: kernel/panic.c: add TAINT_AUX This is the gist of a patch which we've been forward-porting in our kernels for a long time now and it probably would make a good sense to have such TAINT_AUX flag upstream which can be used by each distro etc, how they see fit. This way, we won't need to forward-port a distro-only version indefinitely. Add an auxiliary taint flag to be used by distros and others. 
This obviates the need to forward-port whatever internal solutions people have in favor of a single flag which they can map arbitrarily to a definition of their pleasing. The "X" mnemonic could also mean eXternal, which would be taint from a distro or something else but not the upstream kernel. We will use it to mark modules for which we don't provide support. I.e., a really eXternal module. Link: http://lkml.kernel.org/r/20170911134533.dp5mtyku5bongx4c@pd.tnic Signed-off-by: Borislav Petkov Cc: Kees Cook Cc: Jessica Yu Cc: Peter Zijlstra Cc: Jiri Slaby Cc: Jiri Olsa Cc: Michal Marek Cc: Jiri Kosina Cc: Takashi Iwai Cc: Petr Mladek Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 3242b64b1956..2cfef408fec9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ { 'K', ' ', true }, /* TAINT_LIVEPATCH */ + { 'X', ' ', true }, /* TAINT_AUX */ }; /** @@ -345,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. * 'K' - Kernel has been live patched. + * 'X' - Auxiliary taint, for distros' use. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3 From fcf4edac049a8bca41658970292e2dfdbc9d5f62 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Nov 2017 15:30:42 -0800 Subject: kcov: remove pointless current != NULL check __sanitizer_cov_trace_pc() is a hot code, so it's worth to remove pointless '!current' check. Current is never NULL. Link: http://lkml.kernel.org/r/20170929162221.32500-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Acked-by: Mark Rutland Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index fc6af9e1308b..d9f9fa9cacc6 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -62,7 +62,7 @@ void notrace __sanitizer_cov_trace_pc(void) * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ - if (!t || !in_task()) + if (!in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit v1.2.3 From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:46 -0800 Subject: kcov: support comparison operands collection Enables kcov to collect comparison operands from instrumented code. This is done by using Clang's -fsanitize=trace-cmp instrumentation (currently not available for GCC). The comparison operands help a lot in fuzz testing. E.g. they are used in Syzkaller to cover the interiors of conditional statements with way less attempts and thus make previously unreachable code reachable. To allow separate collection of coverage and comparison operands two different work modes are implemented. Mode selection is now done via a KCOV_ENABLE ioctl call with corresponding argument value. 
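A minimal user-space sketch of driving the new mode (illustrative; it follows the usual kcov usage pattern rather than being part of this patch). The ioctl encodings and the KCOV_TRACE_CMP value below are as defined in the kcov uapi around this series and should be checked against include/uapi/linux/kcov.h; the example assumes debugfs at /sys/kernel/debug and a 64-bit build, and reads back the {type, arg1, arg2, ip} records described above:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
#define KCOV_ENABLE     _IO('c', 100)
#define KCOV_DISABLE    _IO('c', 101)
#define KCOV_TRACE_CMP  1
#define COVER_SIZE      (64 << 10)

int main(void)
{
        uint64_t *cover, n, i;
        int fd = open("/sys/kernel/debug/kcov", O_RDWR);

        if (fd < 0)
                return 1;
        if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
                return 1;
        cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (cover == MAP_FAILED)
                return 1;
        if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_CMP))
                return 1;
        __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);

        read(-1, NULL, 0);      /* the syscall whose comparisons we want */

        n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
        for (i = 0; i < n; i++)
                printf("type 0x%llx: 0x%llx vs 0x%llx at 0x%llx\n",
                       (unsigned long long)cover[1 + i * 4],
                       (unsigned long long)cover[1 + i * 4 + 1],
                       (unsigned long long)cover[1 + i * 4 + 2],
                       (unsigned long long)cover[1 + i * 4 + 3]);
        ioctl(fd, KCOV_DISABLE, 0);
        close(fd);
        return 0;
}
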
Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 179 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index d9f9fa9cacc6..15f33faf4013 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -22,13 +22,21 @@ #include #include +/* Number of 64-bit words written per one comparison: */ +#define KCOV_WORDS_PER_CMP 4 + /* * kcov descriptor (one per opened debugfs file). * State transitions of the descriptor: * - initial state after open() * - then there must be a single ioctl(KCOV_INIT_TRACE) call * - then, mmap() call (several calls are allowed but not useful) - * - then, repeated enable/disable for a task (only one task a time allowed) + * - then, ioctl(KCOV_ENABLE, arg), where arg is + * KCOV_TRACE_PC - to trace only the PCs + * or + * KCOV_TRACE_CMP - to trace only the comparison operands + * - then, ioctl(KCOV_DISABLE) to disable the task. + * Enabling/disabling ioctls can be repeated (only one task a time allowed). */ struct kcov { /* @@ -48,51 +56,176 @@ struct kcov { struct task_struct *t; }; -/* - * Entry point from instrumented code. - * This is called once per basic-block/edge. - */ -void notrace __sanitizer_cov_trace_pc(void) +static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { - struct task_struct *t; enum kcov_mode mode; - t = current; /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ if (!in_task()) - return; + return false; mode = READ_ONCE(t->kcov_mode); - if (mode == KCOV_MODE_TRACE) { - unsigned long *area; - unsigned long pos; - unsigned long ip = _RET_IP_; + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + return mode == needed_mode; +} +static unsigned long canonicalize_ip(unsigned long ip) +{ #ifdef CONFIG_RANDOMIZE_BASE - ip -= kaslr_offset(); + ip -= kaslr_offset(); #endif + return ip; +} - /* - * There is some code that runs in interrupts but for which - * in_interrupt() returns false (e.g. preempt_schedule_irq()). - * READ_ONCE()/barrier() effectively provides load-acquire wrt - * interrupts, there are paired barrier()/WRITE_ONCE() in - * kcov_ioctl_locked(). - */ - barrier(); - area = t->kcov_area; - /* The first word is number of subsequent PCs. */ - pos = READ_ONCE(area[0]) + 1; - if (likely(pos < t->kcov_size)) { - area[pos] = ip; - WRITE_ONCE(area[0], pos); - } +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void notrace __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + unsigned long *area; + unsigned long ip = canonicalize_ip(_RET_IP_); + unsigned long pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t)) + return; + + area = t->kcov_area; + /* The first 64-bit word is the number of subsequent PCs. 
*/ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = ip; + WRITE_ONCE(area[0], pos); } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS +static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +{ + struct task_struct *t; + u64 *area; + u64 count, start_index, end_pos, max_pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t)) + return; + + ip = canonicalize_ip(ip); + + /* + * We write all comparison arguments and types as u64. + * The buffer was allocated for t->kcov_size unsigned longs. + */ + area = (u64 *)t->kcov_area; + max_pos = t->kcov_size * sizeof(unsigned long); + + count = READ_ONCE(area[0]); + + /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */ + start_index = 1 + count * KCOV_WORDS_PER_CMP; + end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); + if (likely(end_pos <= max_pos)) { + area[start_index] = type; + area[start_index + 1] = arg1; + area[start_index + 2] = arg2; + area[start_index + 3] = ip; + WRITE_ONCE(area[0], count + 1); + } +} + +void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1); + +void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); + +void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); + +void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8); + +void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1); + +void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); + +void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); + +void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); + +void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) +{ + u64 i; + u64 count = cases[0]; + u64 size = cases[1]; + u64 type = KCOV_CMP_CONST; + + switch (size) { + case 8: + type |= KCOV_CMP_SIZE(0); + break; + case 16: + type |= KCOV_CMP_SIZE(1); + break; + case 32: + type |= KCOV_CMP_SIZE(2); + break; + case 64: + type |= KCOV_CMP_SIZE(3); + break; + default: + return; + } + for (i = 0; i < count; i++) + write_comp_data(type, cases[i + 2], val, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_switch); +#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ + static void kcov_get(struct kcov *kcov) { atomic_inc(&kcov->refcount); @@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t) /* Just to not leave dangling references behind. 
*/ kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; spin_unlock(&kcov->lock); kcov_put(kcov); } @@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) spin_lock(&kcov->lock); size = kcov->size * sizeof(unsigned long); - if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; @@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep) kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); if (!kcov) return -ENOMEM; + kcov->mode = KCOV_MODE_DISABLED; atomic_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; @@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (size < 2 || size > INT_MAX / sizeof(unsigned long)) return -EINVAL; kcov->size = size; - kcov->mode = KCOV_MODE_TRACE; + kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: /* @@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, * at task exit or voluntary by KCOV_DISABLE. After that it can * be enabled for another task. */ - unused = arg; - if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || - kcov->area == NULL) + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; if (kcov->t != NULL) return -EBUSY; + if (arg == KCOV_TRACE_PC) + kcov->mode = KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + kcov->mode = KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; - /* See comment in __sanitizer_cov_trace_pc(). */ + /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, kcov->mode); t->kcov = kcov; @@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; kcov_put(kcov); return 0; default: -- cgit v1.2.3 From 2d8364bae4db144df75ba85e92d2b8619ba8eedc Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Nov 2017 15:30:57 -0800 Subject: kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. 
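The wrapper makes the unregister path automatic via devres. A minimal usage sketch, assuming the declaration is exported through <linux/reboot.h> next to register_reboot_notifier(); the driver, device and callback names below are made up for illustration:

  #include <linux/module.h>
  #include <linux/notifier.h>
  #include <linux/platform_device.h>
  #include <linux/reboot.h>

  /* Hypothetical consumer of devm_register_reboot_notifier(). */
  static int example_reboot_notify(struct notifier_block *nb,
                                   unsigned long action, void *data)
  {
          pr_info("example: shutting down, action=%lu\n", action);
          return NOTIFY_DONE;
  }

  static struct notifier_block example_reboot_nb = {
          .notifier_call = example_reboot_notify,
  };

  static int example_probe(struct platform_device *pdev)
  {
          /*
           * devres unregisters the notifier when the device is unbound,
           * so no matching call is needed in a remove() path.
           */
          return devm_register_reboot_notifier(&pdev->dev, &example_reboot_nb);
  }

  static struct platform_driver example_driver = {
          .probe  = example_probe,
          .driver = { .name = "example-reboot" },
  };
  module_platform_driver(example_driver);
  MODULE_LICENSE("GPL");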
[akpm@linux-foundation.org: move `struct device' forward decl to top-of-file] Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. -- cgit v1.2.3 From 13a9c48a85ccf1417b527975c0a12b47fbfaf625 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:51 -0800 Subject: bpf: offload: add comment warning developers about double destroy Offload state may get destroyed either because the device for which it was constructed is going away, or because the refcount of bpf program itself has reached 0. In both of those cases we will call __bpf_prog_offload_destroy() to unlink the offload from the device. We may in fact call it twice, which works just fine, but we should make clear this is intended and caution others trying to extend the function. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2816feb38be1..fd696d3dd429 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -85,6 +85,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; + /* Caution - if netdev is destroyed before the program, this function + * will be called twice. + */ + data.offload.prog = prog; if (offload->verifier_running) -- cgit v1.2.3 From 649f11dcd19a5f0d00fdbc760fbdccdd98e56a43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:52 -0800 Subject: bpf: offload: limit offload to cls_bpf and xdp programs only We are currently only allowing attachment of device-bound cls_bpf and XDP programs. Make this restriction explicit in the BPF offload code. This way we can potentially reuse the ifindex field in the future. Since XDP and cls_bpf programs can only be loaded by admin, we can drop the explicit capability check from offload code. 
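Seen from user space, a device-bound load request now has to use one of the two offloadable program types or the kernel rejects it with -EINVAL. A rough sketch of such a request through the raw bpf(2) syscall; the helper name is invented, error handling and the instruction stream are omitted, and at this point in the series the attribute is still called prog_target_ifindex (it is renamed to prog_ifindex in the next patch):

  #include <linux/bpf.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Illustrative only: offloaded loads must be SCHED_CLS or XDP. */
  static int load_offloaded_prog(const struct bpf_insn *insns, __u32 insn_cnt,
                                 int ifindex)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.prog_type = BPF_PROG_TYPE_XDP;  /* or BPF_PROG_TYPE_SCHED_CLS */
          attr.insns = (__u64)(unsigned long)insns;
          attr.insn_cnt = insn_cnt;
          attr.license = (__u64)(unsigned long)"GPL";
          attr.prog_target_ifindex = ifindex;  /* request offload to this netdev */

          return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }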
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index fd696d3dd429..ac187f9ee182 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -14,8 +14,9 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) struct net *net = current->nsproxy->net_ns; struct bpf_dev_offload *offload; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && + attr->prog_type != BPF_PROG_TYPE_XDP) + return -EINVAL; if (attr->prog_flags) return -EINVAL; -- cgit v1.2.3 From 1f6f4cb7ba219b00a3fa9afe8049fa16444d8b52 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:53 -0800 Subject: bpf: offload: rename the ifindex field bpf_target_prog seems long and clunky, rename it to prog_ifindex. We don't want to call this field just ifindex, because maps may need a similar field in the future and bpf_attr members for programs and maps are unnamed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 2 +- kernel/bpf/syscall.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac187f9ee182..a778e5df7e26 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -29,7 +29,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) init_waitqueue_head(&offload->verifier_done); rtnl_lock(); - offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); if (!offload->netdev) { rtnl_unlock(); kfree(offload); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 09badc37e864..8e9d065bb7cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1118,7 +1118,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex +#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1181,7 +1181,7 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; - if (attr->prog_target_ifindex) { + if (attr->prog_ifindex) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; -- cgit v1.2.3 From 288b3de55aace830f13280985ec9e6bcbff33b1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:54 -0800 Subject: bpf: offload: move offload device validation out to the drivers With TC shared block changes we can't depend on correct netdev pointer being available in cls_bpf. Move the device validation to the driver. Core will only make sure that offloaded programs are always attached in the driver (or in HW by the driver). We trust that drivers which implement offload callbacks will perform necessary checks. Moving the checks to the driver is generally a useful thing, in practice the check should be against a switchdev instance, not a netdev, given that most ASICs will probably allow using the same program on many ports. 
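A hypothetical driver-side counterpart could look like the sketch below; the function and the comparison target are invented (a real driver would more likely check against its own switchdev/ASIC instance rather than a single netdev, as noted above), while bpf_prog_is_dev_bound() and the offload->netdev field are the ones visible elsewhere in this series:

  #include <linux/bpf.h>
  #include <linux/netdevice.h>

  /* Sketch of a driver-side ownership check, not taken from any driver. */
  static int example_offload_check(struct net_device *netdev,
                                   struct bpf_prog *prog)
  {
          struct bpf_dev_offload *offload;

          if (!prog)
                  return 0;               /* nothing to offload */

          /* only device-bound programs may be offloaded at all */
          if (!bpf_prog_is_dev_bound(prog->aux))
                  return -EINVAL;

          /* ... and they must be bound to a device this driver owns */
          offload = prog->aux->offload;
          if (offload->netdev != netdev)
                  return -EINVAL;

          return 0;
  }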
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jiri Pirko Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8e9d065bb7cd..38da55905ab0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,22 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_can_attach(struct bpf_prog *prog, - enum bpf_prog_type *attach_type, - struct net_device *netdev) +static bool bpf_prog_get_ok(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, bool attach_drv) { - struct bpf_dev_offload *offload = prog->aux->offload; + /* not an attachment, just a refcount inc, always allow */ + if (!attach_type) + return true; if (prog->type != *attach_type) return false; - if (offload && offload->netdev != netdev) + if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) return false; return true; } static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, - struct net_device *netdev) + bool attach_drv) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1080,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { + if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1093,12 +1094,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL, NULL); + return __bpf_prog_get(ufd, NULL, false); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1107,9 +1108,9 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev) + bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); -- cgit v1.2.3 From 479321e9c31a6c05426790b11888427400f75ac8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:56 -0800 Subject: bpf: turn bpf_prog_get_type() into a wrapper bpf_prog_get_type() is identical to bpf_prog_get_type_dev(), with false passed as attach_drv. Instead of keeping it as an exported symbol turn it into static inline wrapper. 
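The replacement presumably lands in include/linux/bpf.h as a one-line forwarder; that header hunk is not part of the diffstat below, but it can only look roughly like:

  /* Sketch of the header-side wrapper that replaces the exported symbol. */
  static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
                                                   enum bpf_prog_type type)
  {
          return bpf_prog_get_type_dev(ufd, type, false);
  }

Because bpf_prog_get_type_dev() already fires the bpf_prog_get_type tracepoint, callers of the wrapper keep the same behaviour.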
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 38da55905ab0..41509cf825d8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1097,16 +1097,6 @@ struct bpf_prog *bpf_prog_get(u32 ufd) return __bpf_prog_get(ufd, NULL, false); } -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) -{ - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; -} -EXPORT_SYMBOL_GPL(bpf_prog_get_type); - struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { -- cgit v1.2.3 From 62c71b45e8537b8cb746cc929ea05ba0258e0b5a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:57 -0800 Subject: bpf: offload: ignore namespace moves We are currently destroying the device offload state when device moves to another net namespace. This doesn't break with current NFP code, because offload state is not used on program removal, but it's not correct behaviour. Ignore the device unregister notifications on namespace move. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index a778e5df7e26..d4267c674fec 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -174,6 +174,10 @@ static int bpf_offload_notification(struct notifier_block *notifier, switch (event) { case NETDEV_UNREGISTER: + /* ignore namespace changes */ + if (netdev->reg_state != NETREG_UNREGISTERING) + break; + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) { if (offload->netdev == netdev) -- cgit v1.2.3 From 1ee640095f049e7ac4ec36b985abada497b98cc2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:59 -0800 Subject: bpf: revert report offload info to user space This reverts commit bd601b6ada11 ("bpf: report offload info to user space"). The ifindex by itself is not sufficient, we should provide information on which network namespace this ifindex belongs to. After considering some options we concluded that it's best to just remove this API for now, and rework it in -next. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 12 ------------ kernel/bpf/syscall.c | 5 ----- 2 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d4267c674fec..68ec884440b7 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -149,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } -u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) -{ - struct bpf_dev_offload *offload = prog->aux->offload; - u32 ifindex; - - rtnl_lock(); - ifindex = offload->netdev ? 
offload->netdev->ifindex : 0; - rtnl_unlock(); - - return ifindex; -} - const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 41509cf825d8..2c4cfeaa8d5e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1616,11 +1616,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { - info.status |= BPF_PROG_STATUS_DEV_BOUND; - info.ifindex = bpf_prog_offload_ifindex(prog); - } - done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 24ed960abf1d50cb7834e99a0cfc081bc0656712 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 28 Aug 2017 11:28:21 -0700 Subject: treewide: Switch DEFINE_TIMER callbacks to struct timer_list * This changes all DEFINE_TIMER() callbacks to use a struct timer_list pointer instead of unsigned long. Since the data argument has already been removed, none of these callbacks are using their argument currently, so this renames the argument to "unused". Done using the following semantic patch: @match_define_timer@ declarer name DEFINE_TIMER; identifier _timer, _callback; @@ DEFINE_TIMER(_timer, _callback); @change_callback depends on match_define_timer@ identifier match_define_timer._callback; type _origtype; identifier _origarg; @@ void -_callback(_origtype _origarg) +_callback(struct timer_list *unused) { ... } Signed-off-by: Kees Cook --- kernel/irq/spurious.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 1215229d1c12..ef2a47e0eab6 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -20,7 +20,7 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) -static void poll_spurious_irqs(unsigned long dummy); +static void poll_spurious_irqs(struct timer_list *unused); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); static int irq_poll_cpu; static atomic_t irq_poll_active; @@ -143,7 +143,7 @@ out: return ok; } -static void poll_spurious_irqs(unsigned long dummy) +static void poll_spurious_irqs(struct timer_list *unused) { struct irq_desc *desc; int i; -- cgit v1.2.3 From b9eaf18722221ef8b2bd6a67240ebe668622152a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 13:15:39 -0700 Subject: treewide: init_timer() -> setup_timer() This mechanically converts all remaining cases of ancient open-coded timer setup with the old setup_timer() API, which is the first step in timer conversions. This has no behavioral changes, since it ultimately just changes the order of assignment to fields of struct timer_list when finding variations of: init_timer(&t); f.function = timer_callback; t.data = timer_callback_arg; to be converted into: setup_timer(&t, timer_callback, timer_callback_arg); The conversion is done with the following Coccinelle script, which is an improved version of scripts/cocci/api/setup_timer.cocci, in the following ways: - assignments-before-init_timer() cases - limit the .data case removal to the specific struct timer_list instance - handling calls by dereference (timer->field vs timer.field) spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . 
\ --cocci-file ~/src/data/setup_timer.cocci @fix_address_of@ expression e; @@ init_timer( -&(e) +&e , ...) // Match the common cases first to avoid Coccinelle parsing loops with // "... when" clauses. @match_immediate_function_data_after_init_timer@ expression e, func, da; @@ -init_timer +setup_timer ( \(&e\|e\) +, func, da ); ( -\(e.function\|e->function\) = func; -\(e.data\|e->data\) = da; | -\(e.data\|e->data\) = da; -\(e.function\|e->function\) = func; ) @match_immediate_function_data_before_init_timer@ expression e, func, da; @@ ( -\(e.function\|e->function\) = func; -\(e.data\|e->data\) = da; | -\(e.data\|e->data\) = da; -\(e.function\|e->function\) = func; ) -init_timer +setup_timer ( \(&e\|e\) +, func, da ); @match_function_and_data_after_init_timer@ expression e, e2, e3, e4, e5, func, da; @@ -init_timer +setup_timer ( \(&e\|e\) +, func, da ); ... when != func = e2 when != da = e3 ( -e.function = func; ... when != da = e4 -e.data = da; | -e->function = func; ... when != da = e4 -e->data = da; | -e.data = da; ... when != func = e5 -e.function = func; | -e->data = da; ... when != func = e5 -e->function = func; ) @match_function_and_data_before_init_timer@ expression e, e2, e3, e4, e5, func, da; @@ ( -e.function = func; ... when != da = e4 -e.data = da; | -e->function = func; ... when != da = e4 -e->data = da; | -e.data = da; ... when != func = e5 -e.function = func; | -e->data = da; ... when != func = e5 -e->function = func; ) ... when != func = e2 when != da = e3 -init_timer +setup_timer ( \(&e\|e\) +, func, da ); @r1 exists@ expression t; identifier f; position p; @@ f(...) { ... when any init_timer@p(\(&t\|t\)) ... when any } @r2 exists@ expression r1.t; identifier g != r1.f; expression e8; @@ g(...) { ... when any \(t.data\|t->data\) = e8 ... when any } // It is dangerous to use setup_timer if data field is initialized // in another function. @script:python depends on r2@ p << r1.p; @@ cocci.include_match(False) @r3@ expression r1.t, func, e7; position r1.p; @@ ( -init_timer@p(&t); +setup_timer(&t, func, 0UL); ... when != func = e7 -t.function = func; | -t.function = func; ... when != func = e7 -init_timer@p(&t); +setup_timer(&t, func, 0UL); | -init_timer@p(t); +setup_timer(t, func, 0UL); ... when != func = e7 -t->function = func; | -t->function = func; ... when != func = e7 -init_timer@p(t); +setup_timer(t, func, 0UL); ) Signed-off-by: Kees Cook --- kernel/time/clocksource.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 03918a19cf2d..5b51d5ba2a85 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; + setup_timer(&watchdog_timer, clocksource_watchdog, 0UL); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; -- cgit v1.2.3 From e99e88a9d2b067465adaa9c111ada99a041bef9a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 14:43:17 -0700 Subject: treewide: setup_timer() -> timer_setup() This converts all remaining cases of the old setup_timer() API into using timer_setup(), where the callback argument is the structure already holding the struct timer_list. 
These should have no behavioral changes, since they just change which pointer is passed into the callback with the same available pointers after conversion. It handles the following examples, in addition to some other variations. Casting from unsigned long: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... setup_timer(&ptr->my_timer, my_callback, ptr); and forced object casts: void my_callback(struct something *ptr) { ... } ... setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr); become: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... timer_setup(&ptr->my_timer, my_callback, 0); Direct function assignments: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... ptr->my_timer.function = my_callback; have a temporary cast added, along with converting the args: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback; And finally, callbacks without a data assignment: void my_callback(unsigned long data) { ... } ... setup_timer(&ptr->my_timer, my_callback, 0); have their argument renamed to verify they're unused during conversion: void my_callback(struct timer_list *unused) { ... } ... timer_setup(&ptr->my_timer, my_callback, 0); The conversion is done with the following Coccinelle script: spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . \ --cocci-file ~/src/data/timer_setup.cocci @fix_address_of@ expression e; @@ setup_timer( -&(e) +&e , ...) // Update any raw setup_timer() usages that have a NULL callback, but // would otherwise match change_timer_function_usage, since the latter // will update all function assignments done in the face of a NULL // function initialization in setup_timer(). 
@change_timer_function_usage_NULL@ expression _E; identifier _timer; type _cast_data; @@ ( -setup_timer(&_E->_timer, NULL, _E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E->_timer, NULL, (_cast_data)_E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E._timer, NULL, &_E); +timer_setup(&_E._timer, NULL, 0); | -setup_timer(&_E._timer, NULL, (_cast_data)&_E); +timer_setup(&_E._timer, NULL, 0); ) @change_timer_function_usage@ expression _E; identifier _timer; struct timer_list _stl; identifier _callback; type _cast_func, _cast_data; @@ ( -setup_timer(&_E->_timer, _callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | _E->_timer@_stl.function = _callback; | _E->_timer@_stl.function = &_callback; | _E->_timer@_stl.function = (_cast_func)_callback; | _E->_timer@_stl.function = (_cast_func)&_callback; | _E._timer@_stl.function = _callback; | _E._timer@_stl.function = &_callback; | _E._timer@_stl.function = (_cast_func)_callback; | _E._timer@_stl.function = (_cast_func)&_callback; ) // callback(unsigned long arg) @change_callback_handle_cast depends on change_timer_function_usage@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; identifier _handle; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { ( ... when != _origarg _handletype *_handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... 
when != _origarg ) } // callback(unsigned long arg) without existing variable @change_callback_handle_cast_no_arg depends on change_timer_function_usage && !change_callback_handle_cast@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { + _handletype *_origarg = from_timer(_origarg, t, _timer); + ... when != _origarg - (_handletype *)_origarg + _origarg ... when != _origarg } // Avoid already converted callbacks. @match_callback_converted depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier t; @@ void _callback(struct timer_list *t) { ... } // callback(struct something *handle) @change_callback_handle_arg depends on change_timer_function_usage && !match_callback_converted && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; @@ void _callback( -_handletype *_handle +struct timer_list *t ) { + _handletype *_handle = from_timer(_handle, t, _timer); ... } // If change_callback_handle_arg ran on an empty function, remove // the added handler. @unchange_callback_handle_arg depends on change_timer_function_usage && change_callback_handle_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; identifier t; @@ void _callback(struct timer_list *t) { - _handletype *_handle = from_timer(_handle, t, _timer); } // We only want to refactor the setup_timer() data argument if we've found // the matching callback. This undoes changes in change_timer_function_usage. @unchange_timer_function_usage depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg && !change_callback_handle_arg@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type change_timer_function_usage._cast_data; @@ ( -timer_setup(&_E->_timer, _callback, 0); +setup_timer(&_E->_timer, _callback, (_cast_data)_E); | -timer_setup(&_E._timer, _callback, 0); +setup_timer(&_E._timer, _callback, (_cast_data)&_E); ) // If we fixed a callback from a .function assignment, fix the // assignment cast now. @change_timer_function_assignment depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_func; typedef TIMER_FUNC_TYPE; @@ ( _E->_timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -&_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)_callback; +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -&_callback; +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; ) // Sometimes timer functions are called directly. Replace matched args. 
@change_timer_function_calls depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression _E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_data; @@ _callback( ( -(_cast_data)_E +&_E->_timer | -(_cast_data)&_E +&_E._timer | -_E +&_E->_timer ) ) // If a timer has been configured without a data argument, it can be // converted without regard to the callback argument, since it is unused. @match_timer_function_unused_data@ expression _E; identifier _timer; identifier _callback; @@ ( -setup_timer(&_E->_timer, _callback, 0); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0L); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0UL); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0L); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0UL); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_timer, _callback, 0); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0L); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0UL); +timer_setup(&_timer, _callback, 0); | -setup_timer(_timer, _callback, 0); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0L); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0UL); +timer_setup(_timer, _callback, 0); ) @change_callback_unused_data depends on match_timer_function_unused_data@ identifier match_timer_function_unused_data._callback; type _origtype; identifier _origarg; @@ void _callback( -_origtype _origarg +struct timer_list *unused ) { ... 
when != _origarg } Signed-off-by: Kees Cook --- kernel/padata.c | 6 +++--- kernel/time/clocksource.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index f262c9a4e70a..57c0074d50cc 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -288,9 +288,9 @@ static void invoke_padata_reorder(struct work_struct *work) local_bh_enable(); } -static void padata_reorder_timer(unsigned long arg) +static void padata_reorder_timer(struct timer_list *t) { - struct parallel_data *pd = (struct parallel_data *)arg; + struct parallel_data *pd = from_timer(pd, t, timer); unsigned int weight; int target_cpu, cpu; @@ -485,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); - setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); + timer_setup(&pd->timer, padata_reorder_timer, 0); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 5b51d5ba2a85..65f9e3f24dde 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_watchdog(unsigned long data) +static void clocksource_watchdog(struct timer_list *unused) { struct clocksource *cs; u64 csnow, wdnow, cslast, wdlast, delta; @@ -290,7 +290,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; - setup_timer(&watchdog_timer, clocksource_watchdog, 0UL); + timer_setup(&watchdog_timer, clocksource_watchdog, 0); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; -- cgit v1.2.3 From c1eba5bcb6430868427e0b9d1cd1205a07302f06 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:18:19 -0700 Subject: timer: Pass timer_list pointer to callbacks unconditionally Now that all timer callbacks are already taking their struct timer_list pointer as the callback argument, just do this unconditionally and remove the .data field. Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- kernel/time/timer.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af0b8bae4502..a07eb124332f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1107,12 +1107,12 @@ EXPORT_SYMBOL(timer_reduce); * add_timer - start a timer * @timer: the timer to be added * - * The kernel will do a ->function(->data) callback from the + * The kernel will do a ->function(@timer) callback from the * timer interrupt at the ->expires point in the future. The * current time is 'jiffies'. * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. + * The timer's ->expires, ->function fields must be set prior calling this + * function. * * Timers with an ->expires field in the past will be executed in the next * timer tick. 
@@ -1284,8 +1284,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), - unsigned long data) +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) { int count = preempt_count(); @@ -1309,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer); - fn(data); + fn((TIMER_DATA_TYPE)timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1332,7 +1331,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(unsigned long); - unsigned long data; timer = hlist_entry(head->first, struct timer_list, entry); @@ -1340,15 +1338,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) detach_timer(timer, true); fn = timer->function; - data = timer->data; if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); + call_timer_fn(timer, fn); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); + call_timer_fn(timer, fn); raw_spin_lock_irq(&base->lock); } } -- cgit v1.2.3 From 354b46b1a0adda1dd5b7f0bc2a5604cca091be5f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 19:15:40 -0700 Subject: timer: Switch callback prototype to take struct timer_list * argument Since all callbacks have been converted, we can switch the core prototype to "struct timer_list *" now too. Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- kernel/time/timer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a07eb124332f..0f0d49a02d04 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1284,7 +1284,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) +static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) { int count = preempt_count(); @@ -1308,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer); - fn((TIMER_DATA_TYPE)timer); + fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1330,7 +1330,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) { while (!hlist_empty(head)) { struct timer_list *timer; - void (*fn)(unsigned long); + void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); -- cgit v1.2.3 From 188665b2d67db8953899551d1a9d4481b2a0ac60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:14:46 -0700 Subject: timer: Pass function down to initialization routines In preparation for removing more macros, pass the function down to the initialization routines instead of doing it in macros. 
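In practice this means a header-side setup macro can hand the callback straight to init_timer_key() instead of assigning ->function itself. A simplified sketch of such a macro (illustrative only; the in-tree macros differ in naming and in how they handle flags):

  /* Example macro: forwards the callback to the init routine. */
  #define EXAMPLE_TIMER_SETUP(timer, fn, flags)                           \
          do {                                                            \
                  static struct lock_class_key __key;                     \
                  init_timer_key((timer), (fn), (flags), #timer, &__key); \
          } while (0)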
Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- kernel/time/timer.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0f0d49a02d04..ffebcf878fba 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer) debug_object_assert_init(timer, &timer_debug_descr); } -static void do_init_timer(struct timer_list *timer, unsigned int flags, +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, +void init_timer_on_stack_key(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - do_init_timer(timer, flags, name, key); + do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); @@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer) debug_timer_assert_init(timer); } -static void do_init_timer(struct timer_list *timer, unsigned int flags, +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; + timer->function = func; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } @@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, /** * init_timer_key - initialize a timer * @timer: the timer to be initialized + * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer @@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, unsigned int flags, +void init_timer_key(struct timer_list *timer, + void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); - do_init_timer(timer, flags, name, key); + do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(init_timer_key); -- cgit v1.2.3 From 841b86f3289dbe858daeceec36423d4ea286fac2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 23 Oct 2017 09:40:42 +0200 Subject: treewide: Remove TIMER_FUNC_TYPE and TIMER_DATA_TYPE casts With all callbacks converted, and the timer callback prototype switched over, the TIMER_FUNC_TYPE cast is no longer needed, so remove it. Conversion was done with the following scripts: perl -pi -e 's|\(TIMER_FUNC_TYPE\)||g' \ $(git grep TIMER_FUNC_TYPE | cut -d: -f1 | sort -u) perl -pi -e 's|\(TIMER_DATA_TYPE\)||g' \ $(git grep TIMER_DATA_TYPE | cut -d: -f1 | sort -u) The now unused macros are also dropped from include/linux/timer.h. 
Signed-off-by: Kees Cook --- kernel/kthread.c | 2 +- kernel/workqueue.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 8af313081b0d..cd50e99202b0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -843,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dde6298f6b22..8fdb710bfdd7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); -- cgit v1.2.3 From db1ac4964fa172803a0fea83033cd35d380a8a77 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:53 +0000 Subject: bpf: introduce ARG_PTR_TO_MEM_OR_NULL With the current ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM semantics, an helper argument can be NULL when the next argument type is ARG_CONST_SIZE_OR_ZERO and the verifier can prove the value of this next argument is 0. However, most helpers are just interested in handling , so forcing them to deal with makes the implementation of those helpers more complicated for no apparent benefits, requiring them to explicitly handle those corner cases with checks that bpf programs could start relying upon, preventing the possibility of removing them later. Solve this by making ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM never accept NULL even when ARG_CONST_SIZE_OR_ZERO is set, and introduce a new argument type ARG_PTR_TO_MEM_OR_NULL to explicitly deal with the NULL case. Currently, the only helper that needs this is bpf_csum_diff_proto(), so change arg1 and arg3 to this new type as well. Also add a new battery of tests that explicitly test the !ARG_PTR_TO_MEM_OR_NULL combination: all the current ones testing the various variations are focused on bpf_csum_diff, so cover also other helpers. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd54d20ace2f..308b0638ec5d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1384,13 +1384,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type != expected_type) goto err_type; } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_MEM_OR_NULL || arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. 
*/ - if (register_is_null(*reg)) + if (register_is_null(*reg) && + arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && -- cgit v1.2.3 From eb33f2cca49ec49a1b893b5af546e7c042ca6365 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:54 +0000 Subject: bpf: remove explicit handling of 0 for arg2 in bpf_probe_read Commit 9c019e2bc4b2 ("bpf: change helper bpf_probe_read arg2 type to ARG_CONST_SIZE_OR_ZERO") changed arg2 type to ARG_CONST_SIZE_OR_ZERO to simplify writing bpf programs by taking advantage of the new semantics introduced for ARG_CONST_SIZE_OR_ZERO which allows arguments. In order to prevent the helper from actually passing a NULL pointer to probe_kernel_read, which can happen when is passed to the helper, the commit also introduced an explicit check against size == 0. After the recent introduction of the ARG_PTR_TO_MEM_OR_NULL type, bpf_probe_read can not receive a pair of arguments anymore, thus the check is not needed anymore and can be removed, since probe_kernel_read can correctly handle a call. This also fixes the semantics of the helper before it gets officially released and bpf programs start relying on this check. Fixes: 9c019e2bc4b2 ("bpf: change helper bpf_probe_read arg2 type to ARG_CONST_SIZE_OR_ZERO") Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a5580c670866..728909f7951c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -78,16 +78,12 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - int ret = 0; - - if (unlikely(size == 0)) - goto out; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - out: return ret; } -- cgit v1.2.3 From 5c4e1201740ceae9bd6f622851a9bf7c66debe3a Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:55 +0000 Subject: bpf: change bpf_probe_read_str arg2 type to ARG_CONST_SIZE_OR_ZERO Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way the compiler generates optimized BPF code when checking boundaries of an argument from C code. A typical example of this optimized code can be generated using the bpf_probe_read_str helper when operating on variable memory: /* len is a generic scalar */ if (len > 0 && len <= 0x7fff) bpf_probe_read_str(p, len, s); 251: (79) r1 = *(u64 *)(r10 -88) 252: (07) r1 += -1 253: (25) if r1 > 0x7ffe goto pc-42 254: (bf) r1 = r7 255: (79) r2 = *(u64 *)(r10 -88) 256: (bf) r8 = r4 257: (85) call bpf_probe_read_str#45 R2 min value is negative, either use unsigned or 'var &= const' With this code, the verifier loses track of the variable. Replacing arg2 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it avoids this quite common case which leads to usability issues, and the compiler generates code that the verifier can more easily test: if (len <= 0x7fff) bpf_probe_read_str(p, len, s); or bpf_probe_read_str(p, len & 0x7fff, s); No changes to the bpf_probe_read_str helper are necessary since strncpy_from_unsafe itself immediately returns if the size passed is 0. 
Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 728909f7951c..ed8601a1a861 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -494,7 +494,7 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; -- cgit v1.2.3 From a60dd35d2e39209fa7645945e1192bf9769872c6 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:56 +0000 Subject: bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way the compiler generates optimized BPF code when checking boundaries of an argument from C code. A typical example of this optimized code can be generated using the bpf_perf_event_output helper when operating on variable memory: /* len is a generic scalar */ if (len > 0 && len <= 0x7fff) bpf_perf_event_output(ctx, &perf_map, 0, buf, len); 110: (79) r5 = *(u64 *)(r10 -40) 111: (bf) r1 = r5 112: (07) r1 += -1 113: (25) if r1 > 0x7ffe goto pc+6 114: (bf) r1 = r6 115: (18) r2 = 0xffff94e5f166c200 117: (b7) r3 = 0 118: (bf) r4 = r7 119: (85) call bpf_perf_event_output#25 R5 min value is negative, either use unsigned or 'var &= const' With this code, the verifier loses track of the variable. Replacing arg5 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it avoids this quite common case which leads to usability issues, and the compiler generates code that the verifier can more easily test: if (len <= 0x7fff) bpf_perf_event_output(ctx, &perf_map, 0, buf, len); or bpf_perf_event_output(ctx, &perf_map, 0, buf, len & 0x7fff); No changes to the bpf_perf_event_output helper are necessary since it can handle a case where size is 0, and an empty frame is pushed. 
Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ed8601a1a861..27d1f4ffa3de 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -403,7 +403,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); @@ -605,7 +605,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, -- cgit v1.2.3 From c131187db2d3fa2f8bf32fdf4e9a4ef805168467 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 22 Nov 2017 16:42:05 -0800 Subject: bpf: fix branch pruning logic when the verifier detects that register contains a runtime constant and it's compared with another constant it will prune exploration of the branch that is guaranteed not to be taken at runtime. This is all correct, but malicious program may be constructed in such a way that it always has a constant comparison and the other branch is never taken under any conditions. In this case such path through the program will not be explored by the verifier. It won't be taken at run-time either, but since all instructions are JITed the malicious program may cause JITs to complain about using reserved fields, etc. To fix the issue we have to track the instructions explored by the verifier and sanitize instructions that are dead at run time with NOPs. We cannot reject such dead code, since llvm generates it for valid C code, since it doesn't do as much data flow analysis as the verifier does. 
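As an illustration (not taken from the patch), the instruction stream below has one instruction that is reachable in the CFG but provably never executed, because the conditional jump compares a register the verifier knows to be constant. Such an instruction previously went to the JIT untouched; after this change sanitize_dead_code() overwrites it with the "r0 = r0" no-op:

  #include <linux/filter.h>

  static struct bpf_insn dead_branch_example[] = {
          BPF_MOV64_IMM(BPF_REG_1, 0),             /* r1 = 0 (known constant) */
          BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),   /* never taken at run time */
          BPF_JMP_IMM(BPF_JA, 0, 0, 1),            /* skip the dead slot      */
          BPF_MOV64_IMM(BPF_REG_0, 1),             /* dead: gets NOPed now    */
          BPF_MOV64_IMM(BPF_REG_0, 0),
          BPF_EXIT_INSN(),
  };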
Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 308b0638ec5d..d4593571c404 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3827,6 +3827,7 @@ static int do_check(struct bpf_verifier_env *env) return err; regs = cur_regs(env); + env->insn_aux_data[insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -4022,6 +4023,7 @@ process_bpf_exit: return err; insn_idx++; + env->insn_aux_data[insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -4204,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + int i; if (cnt == 1) return 0; @@ -4213,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + for (i = off; i < off + cnt - 1; i++) + new_data[i].seen = true; env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -4231,6 +4236,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +/* The verifier does more data flow analysis than llvm and will not explore + * branches that are dead at run time. Malicious programs can have dead code + * too. Therefore replace all dead at-run-time code with nops. + */ +static void sanitize_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++) { + if (aux_data[i].seen) + continue; + memcpy(insn + i, &nop, sizeof(nop)); + } +} + /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -4557,6 +4581,9 @@ skip_full_check: while (!pop_stack(env, NULL, NULL)); free_states(env); + if (ret == 0) + sanitize_dead_code(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); -- cgit v1.2.3 From 75f1133873d6a1276d3c19918b7c94975840f990 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 22 Nov 2017 12:56:45 -0800 Subject: genirq/matrix: Make - vs ?: Precedence explicit Noticed with a Clang build. This improves the readability of the ?: expression, as it has lower precedence than the - expression. Show explicitly that - is evaluated first. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171122205645.GA27125@beast --- kernel/irq/matrix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index a3cbbc8191c5..7df2480005f8 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -384,7 +384,7 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) { struct cpumap *cm = this_cpu_ptr(m->maps); - return m->global_available - cpudown ? 
cm->available : 0; + return (m->global_available - cpudown) ? cm->available : 0; } /** -- cgit v1.2.3
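For reference, '-' binds tighter than '?:' in C, so the old and the new form group identically and the change is purely about readability. A tiny stand-alone demonstration:

  #include <stdio.h>

  int main(void)
  {
          int avail = 8, cpudown = 1, cpu_avail = 5;

          /* '-' is evaluated first, so both lines print 5. */
          printf("%d\n", avail - cpudown ? cpu_avail : 0);
          printf("%d\n", (avail - cpudown) ? cpu_avail : 0);
          return 0;
  }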