From b032132c3c218f4a09e9499b3674299a752581c6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 30 Jul 2016 13:53:37 -0500 Subject: userns: Free user namespaces in process context Add the necessary boiler plate to move freeing of user namespaces into work queue and thus into process context where things can sleep. This is a necessary precursor to per user namespace sysctls. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..5247cdb24e62 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -29,6 +29,7 @@ static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void free_user_ns(struct work_struct *work); static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { @@ -101,6 +102,7 @@ int create_user_ns(struct cred *new) ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; + INIT_WORK(&ns->work, free_user_ns); /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); @@ -135,9 +137,10 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) return err; } -void free_user_ns(struct user_namespace *ns) +static void free_user_ns(struct work_struct *work) { - struct user_namespace *parent; + struct user_namespace *parent, *ns = + container_of(work, struct user_namespace, work); do { parent = ns->parent; @@ -149,7 +152,12 @@ void free_user_ns(struct user_namespace *ns) ns = parent; } while (atomic_dec_and_test(&parent->count)); } -EXPORT_SYMBOL(free_user_ns); + +void __put_user_ns(struct user_namespace *ns) +{ + schedule_work(&ns->work); +} +EXPORT_SYMBOL(__put_user_ns); static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { -- cgit v1.2.3 From dbec28460a89aa7c02c3301e9e108d98272549d2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 30 Jul 2016 13:58:49 -0500 Subject: userns: Add per user namespace sysctls. Limit per userns sysctls to only be opened for write by a holder of CAP_SYS_RESOURCE. Add all of the necessary boilerplate for having per user namespace sysctls. Acked-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/Makefile | 2 +- kernel/ucount.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/user_namespace.c | 18 ++++++++- 3 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 kernel/ucount.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e2b952..eb26e12c6c2a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o + async.o range.o smpboot.o ucount.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/ucount.c b/kernel/ucount.c new file mode 100644 index 000000000000..cbde1dc87851 --- /dev/null +++ b/kernel/ucount.c @@ -0,0 +1,99 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#include +#include +#include +#include + +#ifdef CONFIG_SYSCTL +static struct ctl_table_set * +set_lookup(struct ctl_table_root *root) +{ + return &current_user_ns()->set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return &current_user_ns()->set == set; +} + +static int set_permissions(struct ctl_table_header *head, + struct ctl_table *table) +{ + struct user_namespace *user_ns = + container_of(head->set, struct user_namespace, set); + int mode; + + /* Allow users with CAP_SYS_RESOURCE unrestrained access */ + if (ns_capable(user_ns, CAP_SYS_RESOURCE)) + mode = (table->mode & S_IRWXU) >> 6; + else + /* Allow all others at most read-only access */ + mode = table->mode & S_IROTH; + return (mode << 6) | (mode << 3) | mode; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, + .permissions = set_permissions, +}; + +static struct ctl_table userns_table[] = { + { } +}; +#endif /* CONFIG_SYSCTL */ + +bool setup_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + setup_sysctl_set(&ns->set, &set_root, set_is_seen); + tbl = kmemdup(userns_table, sizeof(userns_table), GFP_KERNEL); + if (tbl) { + ns->sysctls = __register_sysctl_table(&ns->set, "userns", tbl); + } + if (!ns->sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->set); + return false; + } +#endif + return true; +} + +void retire_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + tbl = ns->sysctls->ctl_table_arg; + unregister_sysctl_table(ns->sysctls); + retire_sysctl_set(&ns->set); + kfree(tbl); +#endif +} + +static __init int user_namespace_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + static struct ctl_table_header *userns_header; + static struct ctl_table empty[1]; + /* + * It is necessary to register the userns directory in the + * default set so that registrations in the child sets work + * properly. + */ + userns_header = register_sysctl("userns", empty); + BUG_ON(!userns_header); + BUG_ON(!setup_userns_sysctls(&init_user_ns)); +#endif + return 0; +} +subsys_initcall(user_namespace_sysctl_init); + + diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5247cdb24e62..a63332253c7e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,6 +23,9 @@ #include #include +extern bool setup_userns_sysctls(struct user_namespace *ns); +extern void retire_userns_sysctls(struct user_namespace *ns); + static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); @@ -109,12 +112,22 @@ int create_user_ns(struct cred *new) ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); - set_cred_user_ns(new, ns); - #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif + ret = -ENOMEM; + if (!setup_userns_sysctls(ns)) + goto fail_keyring; + + set_cred_user_ns(new, ns); return 0; +fail_keyring: +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif + ns_free_inum(&ns->ns); + kmem_cache_free(user_ns_cachep, ns); + return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) @@ -144,6 +157,7 @@ static void free_user_ns(struct work_struct *work) do { parent = ns->parent; + retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif -- cgit v1.2.3
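The mode folding in set_permissions() above is terse enough to misread, so here is a stand-alone sketch of what it computes (ordinary userspace C; fold_mode() and its arguments are illustrative names, not the kernel API). A 0644 table mode is presented as 0666 to a holder of CAP_SYS_RESOURCE in the owning namespace and as 0444 to everyone else; in other words, writability is decided per opener rather than recorded per file:

	#include <stdio.h>
	#include <sys/stat.h>

	/* mirror of set_permissions(): pick one permission class, then
	 * replicate it into the user/group/other slots of the result */
	static int fold_mode(int table_mode, int has_cap_sys_resource)
	{
		int mode;

		if (has_cap_sys_resource)
			mode = (table_mode & S_IRWXU) >> 6;	/* owner rwx bits */
		else
			mode = table_mode & S_IROTH;		/* "other" read bit */
		return (mode << 6) | (mode << 3) | mode;
	}

	int main(void)
	{
		printf("%#o\n", fold_mode(0644, 1));	/* 0666 */
		printf("%#o\n", fold_mode(0644, 0));	/* 0444 */
		return 0;
	}

From b376c3e1b6770ddcb4f0782be16358095fcea0b6 Mon Sep 17 00:00:00 2001 From: "Eric W. 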
Biederman" Date: Mon, 8 Aug 2016 13:41:24 -0500 Subject: userns: Add a limit on the number of user namespaces Export the export the maximum number of user namespaces as /proc/sys/userns/max_user_namespaces. Acked-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 2 ++ kernel/ucount.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/user_namespace.c | 31 +++++++++++++++++++---------- 3 files changed, 75 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..daa6a82b4900 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -321,6 +321,8 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + init_user_ns.max_user_namespaces = max_threads; } int __weak arch_dup_task_struct(struct task_struct *dst, diff --git a/kernel/ucount.c b/kernel/ucount.c index cbde1dc87851..6c2205c0befd 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -43,7 +43,18 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; +static int zero = 0; +static int int_max = INT_MAX; static struct ctl_table userns_table[] = { + { + .procname = "max_user_namespaces", + .data = &init_user_ns.max_user_namespaces, + .maxlen = sizeof(init_user_ns.max_user_namespaces), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max, + }, { } }; #endif /* CONFIG_SYSCTL */ @@ -55,6 +66,8 @@ bool setup_userns_sysctls(struct user_namespace *ns) setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(userns_table, sizeof(userns_table), GFP_KERNEL); if (tbl) { + tbl[0].data = &ns->max_user_namespaces; + ns->sysctls = __register_sysctl_table(&ns->set, "userns", tbl); } if (!ns->sysctls) { @@ -78,6 +91,46 @@ void retire_userns_sysctls(struct user_namespace *ns) #endif } +static inline bool atomic_inc_below(atomic_t *v, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c >= u)) + return false; + old = atomic_cmpxchg(v, c, c+1); + if (likely(old == c)) + return true; + c = old; + } +} + +bool inc_user_namespaces(struct user_namespace *ns) +{ + struct user_namespace *pos, *bad; + for (pos = ns; pos; pos = pos->parent) { + int max = READ_ONCE(pos->max_user_namespaces); + if (!atomic_inc_below(&pos->user_namespaces, max)) + goto fail; + } + return true; +fail: + bad = pos; + for (pos = ns; pos != bad; pos = pos->parent) + atomic_dec(&pos->user_namespaces); + + return false; +} + +void dec_user_namespaces(struct user_namespace *ns) +{ + struct user_namespace *pos; + for (pos = ns; pos; pos = pos->parent) { + int dec = atomic_dec_if_positive(&pos->user_namespaces); + WARN_ON_ONCE(dec < 0); + } +} + static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a63332253c7e..7d87017a0040 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,9 +23,6 @@ #include #include -extern bool setup_userns_sysctls(struct user_namespace *ns); -extern void retire_userns_sysctls(struct user_namespace *ns); - static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); @@ -34,6 +31,7 @@ static bool new_idmap_permitted(const struct file *file, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); + static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same 
capabilities as init but useless for doing @@ -68,8 +66,12 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + ret = -EUSERS; if (parent_ns->level > 32) - return -EUSERS; + goto fail; + + if (!inc_user_namespaces(parent_ns)) + goto fail; /* * Verify that we can not violate the policy of which files * by verifing that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ + ret = -EPERM; if (current_chrooted()) - return -EPERM; + goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ + ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) - return -EPERM; + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) - return -ENOMEM; + goto fail_dec; ret = ns_alloc_inum(&ns->ns); - if (ret) { - kmem_cache_free(user_ns_cachep, ns); - return ret; - } + if (ret) + goto fail_free; ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); @@ -106,6 +109,7 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); + ns->max_user_namespaces = INT_MAX; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); @@ -126,7 +130,11 @@ fail_keyring: key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); +fail_free: kmem_cache_free(user_ns_cachep, ns); +fail_dec: + dec_user_namespaces(parent_ns); +fail: return ret; } @@ -163,6 +171,7 @@ static void free_user_ns(struct work_struct *work) #endif ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); + dec_user_namespaces(parent); ns = parent; } while (atomic_dec_and_test(&parent->count)); } -- cgit v1.2.3 From f6b2db1a3e8d141dd144df58900fb0444d5d7c53 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 13:54:50 -0500 Subject: userns: Make the count of user namespaces per user Add a structure that is per user and per user ns and use it to hold the count of user namespaces. This prevents one user from denying service to another user by creating the maximum number of user namespaces. Rename the sysctl export of the maximum count from /proc/sys/userns/max_user_namespaces to /proc/sys/user/max_user_namespaces to reflect that the count is now per user.
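Before the diff, a stand-alone C11 sketch of the charge/rollback pattern the whole series is built on (demo_level, inc_below() and charge() are illustrative names standing in for struct ucounts, atomic_inc_below() and the inc_*_namespaces() helpers; this is not the kernel code). Every level of the hierarchy is charged with a lock-free compare-and-swap loop that refuses to exceed that level's limit, and a failure anywhere rolls back the levels already charged:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct demo_level {
		atomic_int count;
		int max;
		struct demo_level *parent;
	};

	/* same shape as the kernel's atomic_inc_below(): increment only
	 * while the current value is still below the limit */
	static bool inc_below(atomic_int *v, int limit)
	{
		int c = atomic_load(v);

		while (c < limit) {
			if (atomic_compare_exchange_weak(v, &c, c + 1))
				return true;
			/* on failure c was reloaded; just retry */
		}
		return false;
	}

	static bool charge(struct demo_level *level)
	{
		struct demo_level *pos, *bad;

		for (pos = level; pos; pos = pos->parent)
			if (!inc_below(&pos->count, pos->max))
				goto fail;
		return true;
	fail:
		bad = pos;	/* undo the levels charged before the failure */
		for (pos = level; pos != bad; pos = pos->parent)
			atomic_fetch_sub(&pos->count, 1);
		return false;
	}

	int main(void)
	{
		struct demo_level root = { .max = 2 };
		struct demo_level child = { .max = 10, .parent = &root };

		printf("%d\n", charge(&child));	/* 1 */
		printf("%d\n", charge(&child));	/* 1 */
		printf("%d\n", charge(&child));	/* 0: root at its max, child rolled back */
		return 0;
	}

Signed-off-by: "Eric W. 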
Biederman" --- kernel/fork.c | 2 +- kernel/ucount.c | 116 +++++++++++++++++++++++++++++++++++++++--------- kernel/user_namespace.c | 11 +++-- 3 files changed, 103 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index daa6a82b4900..d8cde533ace3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -322,7 +322,7 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - init_user_ns.max_user_namespaces = max_threads; + init_user_ns.max_user_namespaces = max_threads/2; } int __weak arch_dup_task_struct(struct task_struct *dst, diff --git a/kernel/ucount.c b/kernel/ucount.c index 6c2205c0befd..33c418718304 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,8 +8,20 @@ #include #include #include +#include #include +#define UCOUNTS_HASHTABLE_BITS 10 +static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +static DEFINE_SPINLOCK(ucounts_lock); + +#define ucounts_hashfn(ns, uid) \ + hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ + UCOUNTS_HASHTABLE_BITS) +#define ucounts_hashentry(ns, uid) \ + (ucounts_hashtable + ucounts_hashfn(ns, uid)) + + #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) @@ -45,7 +57,7 @@ static struct ctl_table_root set_root = { static int zero = 0; static int int_max = INT_MAX; -static struct ctl_table userns_table[] = { +static struct ctl_table user_table[] = { { .procname = "max_user_namespaces", .data = &init_user_ns.max_user_namespaces, @@ -64,11 +76,11 @@ bool setup_userns_sysctls(struct user_namespace *ns) #ifdef CONFIG_SYSCTL struct ctl_table *tbl; setup_sysctl_set(&ns->set, &set_root, set_is_seen); - tbl = kmemdup(userns_table, sizeof(userns_table), GFP_KERNEL); + tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { tbl[0].data = &ns->max_user_namespaces; - ns->sysctls = __register_sysctl_table(&ns->set, "userns", tbl); + ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); } if (!ns->sysctls) { kfree(tbl); @@ -91,6 +103,61 @@ void retire_userns_sysctls(struct user_namespace *ns) #endif } +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +{ + struct ucounts *ucounts; + + hlist_for_each_entry(ucounts, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) + return ucounts; + } + return NULL; +} + +static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +{ + struct hlist_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (!ucounts) { + spin_unlock(&ucounts_lock); + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + atomic_set(&new->count, 0); + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) { + kfree(new); + } else { + hlist_add_head(&new->node, hashent); + ucounts = new; + } + } + if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) + ucounts = NULL; + spin_unlock(&ucounts_lock); + return ucounts; +} + +static void put_ucounts(struct ucounts *ucounts) +{ + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock(&ucounts_lock); + hlist_del_init(&ucounts->node); + spin_unlock(&ucounts_lock); + + kfree(ucounts); + } +} + static inline bool atomic_inc_below(atomic_t *v, int u) { int c, old; @@ -105,44 +172,51 @@ static inline bool atomic_inc_below(atomic_t *v, int u) } } -bool 
inc_user_namespaces(struct user_namespace *ns) +struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { - struct user_namespace *pos, *bad; - for (pos = ns; pos; pos = pos->parent) { - int max = READ_ONCE(pos->max_user_namespaces); - if (!atomic_inc_below(&pos->user_namespaces, max)) + struct ucounts *ucounts, *iter, *bad; + struct user_namespace *tns; + ucounts = get_ucounts(ns, uid); + for (iter = ucounts; iter; iter = tns->ucounts) { + int max; + tns = iter->ns; + max = READ_ONCE(tns->max_user_namespaces); + if (!atomic_inc_below(&iter->user_namespaces, max)) goto fail; } - return true; + return ucounts; fail: - bad = pos; - for (pos = ns; pos != bad; pos = pos->parent) - atomic_dec(&pos->user_namespaces); + bad = iter; + for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) + atomic_dec(&iter->user_namespaces); - return false; + put_ucounts(ucounts); + return NULL; } -void dec_user_namespaces(struct user_namespace *ns) +void dec_user_namespaces(struct ucounts *ucounts) { - struct user_namespace *pos; - for (pos = ns; pos; pos = pos->parent) { - int dec = atomic_dec_if_positive(&pos->user_namespaces); + struct ucounts *iter; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + int dec = atomic_dec_if_positive(&iter->user_namespaces); WARN_ON_ONCE(dec < 0); } + put_ucounts(ucounts); } + static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL - static struct ctl_table_header *userns_header; + static struct ctl_table_header *user_header; static struct ctl_table empty[1]; /* - * It is necessary to register the userns directory in the + * It is necessary to register the user directory in the * default set so that registrations in the child sets work * properly. */ - userns_header = register_sysctl("userns", empty); - BUG_ON(!userns_header); + user_header = register_sysctl("user", empty); + BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif return 0; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 7d87017a0040..58c67e5f851c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -31,7 +31,6 @@ static bool new_idmap_permitted(const struct file *file, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); - static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing @@ -64,13 +63,15 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + struct ucounts *ucounts; int ret; ret = -EUSERS; if (parent_ns->level > 32) goto fail; - if (!inc_user_namespaces(parent_ns)) + ucounts = inc_user_namespaces(parent_ns, owner); + if (!ucounts) goto fail; /* @@ -110,6 +111,7 @@ int create_user_ns(struct cred *new) ns->group = group; INIT_WORK(&ns->work, free_user_ns); ns->max_user_namespaces = INT_MAX; + ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); @@ -133,7 +135,7 @@ fail_keyring: fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: - dec_user_namespaces(parent_ns); + dec_user_namespaces(ucounts); fail: return ret; } @@ -164,6 +166,7 @@ static void free_user_ns(struct work_struct *work) container_of(work, struct user_namespace, work); do { + struct ucounts *ucounts = ns->ucounts; parent = ns->parent; retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS @@ -171,7 +174,7 @@ static void free_user_ns(struct work_struct *work) 
#endif ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); - dec_user_namespaces(parent); + dec_user_namespaces(ucounts); ns = parent; } while (atomic_dec_and_test(&parent->count)); } -- cgit v1.2.3 From 25f9c0817c535a728c1088542230fa327c577c9e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:41:52 -0500 Subject: userns: Generalize the user namespace count into ucount The same kind of recursive sane default limit and policy control that has been implemented for the user namespace is desirable for the other namespaces, so generalize the user namespace reference count into a ucount. Acked-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 5 ++++- kernel/ucount.c | 39 +++++++++++++++++++++------------------ kernel/user_namespace.c | 16 ++++++++++++++-- 3 files changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d8cde533ace3..3cb4853a59aa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -302,6 +302,7 @@ int arch_task_struct_size __read_mostly; void __init fork_init(void) { + int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -322,7 +323,9 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - init_user_ns.max_user_namespaces = max_threads/2; + for (i = 0; i < UCOUNT_COUNTS; i++) { + init_user_ns.ucount_max[i] = max_threads/2; + } } int __weak arch_dup_task_struct(struct task_struct *dst, diff --git a/kernel/ucount.c b/kernel/ucount.c index 33c418718304..0f9ab3b26185 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -57,16 +57,17 @@ static struct ctl_table_root set_root = { static int zero = 0; static int int_max = INT_MAX; +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(int), \ + .mode = 0644, \ + .proc_handler = proc_dointvec_minmax, \ + .extra1 = &zero, \ + .extra2 = &int_max, \ + } static struct ctl_table user_table[] = { - { - .procname = "max_user_namespaces", - .data = &init_user_ns.max_user_namespaces, - .maxlen = sizeof(init_user_ns.max_user_namespaces), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, - }, + UCOUNT_ENTRY("max_user_namespaces"), { } }; #endif /* CONFIG_SYSCTL */ @@ -78,8 +79,10 @@ bool setup_userns_sysctls(struct user_namespace *ns) setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { - tbl[0].data = &ns->max_user_namespaces; - + int i; + for (i = 0; i < UCOUNT_COUNTS; i++) { + tbl[i].data = &ns->ucount_max[i]; + } ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); } if (!ns->sysctls) { @@ -172,7 +175,8 @@ static inline bool atomic_inc_below(atomic_t *v, int u) } } -struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) +struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, + enum ucount_type type) { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; @@ -180,31 +184,30 @@ struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) for (iter = ucounts; iter; iter = tns->ucounts) { int max; tns = iter->ns; - max = READ_ONCE(tns->max_user_namespaces); + max = READ_ONCE(tns->ucount_max[type]); - if (!atomic_inc_below(&iter->user_namespaces, max)) + if (!atomic_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) - 
atomic_dec(&iter->user_namespaces); + atomic_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; } -void dec_user_namespaces(struct ucounts *ucounts) +void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - int dec = atomic_dec_if_positive(&iter->user_namespaces); + int dec = atomic_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); } - static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 58c67e5f851c..0edafe305861 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -31,6 +31,16 @@ static bool new_idmap_permitted(const struct file *file, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); +static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) +{ + return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); +} + +static void dec_user_namespaces(struct ucounts *ucounts) +{ + return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); +} + static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing @@ -64,7 +74,7 @@ int create_user_ns(struct cred *new) kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; - int ret; + int ret, i; ret = -EUSERS; if (parent_ns->level > 32) @@ -110,7 +120,9 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); - ns->max_user_namespaces = INT_MAX; + for (i = 0; i < UCOUNT_COUNTS; i++) { + ns->ucount_max[i] = INT_MAX; + } ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From f333c700c6100b53050980986be922bb21466e29 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:08:36 -0500 Subject: pidns: Add a limit on the number of pid namespaces Acked-by: Kees Cook Signed-off-by: "Eric W. 
Biederman" --- kernel/pid_namespace.c | 25 +++++++++++++++++++++---- kernel/ucount.c | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..30a7f3351932 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work) /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 +static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); +} + +static void dec_pid_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); +} + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; + struct ucounts *ucounts; int i; int err; - if (level > MAX_PID_NS_LEVEL) { - err = -EINVAL; + err = -EINVAL; + if (level > MAX_PID_NS_LEVEL) + goto out; + ucounts = inc_pid_namespaces(user_ns); + if (!ucounts) goto out; - } err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) - goto out; + goto out_dec; ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!ns->pidmap[0].page) @@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + ns->ucounts = ucounts; ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); @@ -129,6 +143,8 @@ out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); +out_dec: + dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } @@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); call_rcu(&ns->rcu, delayed_free_pidns); } diff --git a/kernel/ucount.c b/kernel/ucount.c index 0f9ab3b26185..66eca94e4ada 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -68,6 +68,7 @@ static int int_max = INT_MAX; } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), + UCOUNT_ENTRY("max_pid_namespaces"), { } }; #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From f7af3d1c03136275b876f58644599b120cf4ffdd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:11:25 -0500 Subject: utsns: Add a limit on the number of uts namespaces Acked-by: Kees Cook Signed-off-by: "Eric W. 
Biederman" --- kernel/ucount.c | 1 + kernel/utsname.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 66eca94e4ada..866850e2eb92 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -69,6 +69,7 @@ static int int_max = INT_MAX; static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), UCOUNT_ENTRY("max_pid_namespaces"), + UCOUNT_ENTRY("max_uts_namespaces"), { } }; #endif /* CONFIG_SYSCTL */ diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..f3b0bb4ac3ba 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -17,6 +17,16 @@ #include #include +static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); +} + +static void dec_uts_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); +} + static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; @@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + struct ucounts *ucounts; int err; + err = -ENFILE; + ucounts = inc_uts_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; ns = create_uts_ns(); if (!ns) - return ERR_PTR(-ENOMEM); + goto fail_dec; err = ns_alloc_inum(&ns->ns); - if (err) { - kfree(ns); - return ERR_PTR(err); - } + if (err) + goto fail_free; + ns->ucounts = ucounts; ns->ns.ops = &utsns_operations; down_read(&uts_sem); @@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_uts_namespaces(ucounts); +fail: + return ERR_PTR(err); } /* @@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); -- cgit v1.2.3 From aba356616386e6e573a34c6d64ed12443686e5c8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:20:23 -0500 Subject: ipcns: Add a limit on the number of ipc namespaces Acked-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/ucount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 866850e2eb92..fbab75424da6 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -70,6 +70,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), UCOUNT_ENTRY("max_pid_namespaces"), UCOUNT_ENTRY("max_uts_namespaces"), + UCOUNT_ENTRY("max_ipc_namespaces"), { } }; #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From d08311dd6fd8444e39710dd2fb97562895aed8fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:25:30 -0500 Subject: cgroupns: Add a limit on the number of cgroup namespaces Acked-by: Kees Cook Signed-off-by: "Eric W. 
Biederman" --- kernel/cgroup.c | 18 ++++++++++++++++++ kernel/ucount.c | 1 + 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..e9e4427fec46 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6295,6 +6295,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) /* cgroup namespaces */ +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; @@ -6316,6 +6326,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); @@ -6327,6 +6338,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; + struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); @@ -6340,6 +6352,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENFILE); + /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); @@ -6349,10 +6365,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); + dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; diff --git a/kernel/ucount.c b/kernel/ucount.c index fbab75424da6..335cc5d2cdd7 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -71,6 +71,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_pid_namespaces"), UCOUNT_ENTRY("max_uts_namespaces"), UCOUNT_ENTRY("max_ipc_namespaces"), + UCOUNT_ENTRY("max_cgroup_namespaces"), { } }; #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From 703286608a220d53584cca5986aad5305eec75ed Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:33:23 -0500 Subject: netns: Add a limit on the number of net namespaces Acked-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/ucount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 335cc5d2cdd7..205f1a07faac 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -71,6 +71,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_pid_namespaces"), UCOUNT_ENTRY("max_uts_namespaces"), UCOUNT_ENTRY("max_ipc_namespaces"), + UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), { } }; -- cgit v1.2.3 From 3f005e7de3db8d0b3f7a1f399aa061dc35b65864 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 26 Jul 2016 18:12:21 +0100 Subject: perf/core: Sched out groups atomically Groups of events are supposed to be scheduled atomically, such that it is possible to derive meaningful ratios between their values. We take great pains to achieve this when scheduling event groups to a PMU in group_sched_in(), calling {start,commit}_txn() (which fall back to perf_pmu_{disable,enable}() if necessary) to provide this guarantee. 
However we don't mirror this in group_sched_out(), and in some cases events will not be scheduled out atomically. For example, if we disable an event group with PERF_EVENT_IOC_DISABLE, we'll cross-call __perf_event_disable() for the group leader, and will call group_sched_out() without having first disabled the relevant PMU. We will disable/enable the PMU around each pmu->del() call, but between each call the PMU will be enabled and events may count. Avoid this by explicitly disabling and enabling the PMU around event removal in group_sched_out(), mirroring what we do in group_sched_in(). Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1469553141-28314-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1903b8f3a705..11f6bbe168ab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1796,6 +1796,8 @@ group_sched_out(struct perf_event *group_event, struct perf_event *event; int state = group_event->state; + perf_pmu_disable(ctx->pmu); + event_sched_out(group_event, cpuctx, ctx); /* @@ -1804,6 +1806,8 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); + perf_pmu_enable(ctx->pmu); + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } -- cgit v1.2.3 From 09e61b4f78498bd9f213b0a536e80b79507ea89f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jul 2016 18:02:43 +0200 Subject: perf/x86/intel: Rework the large PEBS setup code In order to allow optimizing perf_pmu_sched_task() we must ensure perf_sched_cb_{inc,dec}() are no longer called from NMI context; this means that pmu::{start,stop}() can no longer use them. Prepare for this by reworking the whole large PEBS setup code. The current code relied on the cpuc->pebs_enabled state, however since that reflects the current active state as per pmu::{start,stop}() we can no longer rely on this. Introduce two counters: cpuc->n_pebs and cpuc->n_large_pebs which count the total number of PEBS events and the number of PEBS events that have FREERUNNING set, resp.. With this we can tell if the current setup requires a single record interrupt threshold or can use a larger buffer. This also improves the code in that it re-enables the large threshold once the PEBS event that required single record gets removed. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 11f6bbe168ab..57aff715039f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2818,6 +2818,10 @@ void perf_sched_cb_inc(struct pmu *pmu) /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. 
+ * + * This callback is relevant even to per-cpu events; for example multi event + * PEBS requires this to provide PID/TID information. This requires we flush + * all queued PEBS records before we context switch to a new task. */ static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, -- cgit v1.2.3 From e48c178814b4a33f84f62d01f5a601ebd57fbba8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jul 2016 09:18:30 +0200 Subject: perf/core: Optimize perf_pmu_sched_task() For perf record -b, which requires the pmu::sched_task callback the current code is rather expensive: 7.68% sched-pipe [kernel.vmlinux] [k] perf_pmu_sched_task 5.95% sched-pipe [kernel.vmlinux] [k] __switch_to 5.20% sched-pipe [kernel.vmlinux] [k] __intel_pmu_disable_all 3.95% sched-pipe perf [.] worker_thread The problem is that it will iterate all registered PMUs, most of which will not have anything to do. Avoid this by keeping an explicit list of PMUs that have requested the callback. The perf_sched_cb_{inc,dec}() functions already takes the required pmu argument, and now that these functions are no longer called from NMI context we can use them to manage a list. With this patch applied the function doesn't show up in the top 4 anymore (it dropped to 18th place). 6.67% sched-pipe [kernel.vmlinux] [k] __switch_to 6.18% sched-pipe [kernel.vmlinux] [k] __intel_pmu_disable_all 3.92% sched-pipe [kernel.vmlinux] [k] switch_mm_irqs_off 3.71% sched-pipe perf [.] worker_thread Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 57aff715039f..803481cb6cbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2805,13 +2805,26 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } + void perf_sched_cb_inc(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + this_cpu_inc(perf_sched_cb_usages); } @@ -2829,34 +2842,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, { struct perf_cpu_context *cpuctx; struct pmu *pmu; - unsigned long flags; if (prev == next) return; - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - if (pmu->sched_task) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ - perf_pmu_disable(pmu); + if (WARN_ON_ONCE(!pmu->sched_task)) + continue; - pmu->sched_task(cpuctx->task_ctx, sched_in); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - perf_pmu_enable(pmu); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, 
cpuctx->task_ctx); } - - rcu_read_unlock(); - - local_irq_restore(flags); } static void perf_event_switch(struct task_struct *task, @@ -10393,6 +10396,8 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); + + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } -- cgit v1.2.3 From 31851a9874d63dbb532910a86b2be49c15997ea3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 5 Aug 2016 14:31:29 +0800 Subject: sched/fair: Remove 'cpu_busy' parameter from update_next_balance() The update_next_balance() function is only used by idle balancing, so its 'cpu_busy' parameter is always 0. Open code it instead of passing it around. Signed-off-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1470378689-14892-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..d3005364fb03 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7704,11 +7704,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) } static inline void -update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +update_next_balance(struct sched_domain *sd, unsigned long *next_balance) { unsigned long interval, next; - interval = get_sd_balance_interval(sd, cpu_busy); + /* used by idle balance, so cpu_busy = 0 */ + interval = get_sd_balance_interval(sd, 0); next = sd->last_balance + interval; if (time_after(*next_balance, next)) @@ -7738,7 +7739,7 @@ static int idle_balance(struct rq *this_rq) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); goto out; @@ -7756,7 +7757,7 @@ static int idle_balance(struct rq *this_rq) continue; if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); break; } @@ -7774,7 +7775,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); /* * Stop searching for tasks to pull if there are -- cgit v1.2.3 From a1fd46565bea62840a24bee7b7c60f65bb12bd21 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 5 Aug 2016 14:32:38 +0800 Subject: sched/core: Fix one typo Fix one minor typo in the comment: s/targer/target/. Signed-off-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1470378758-15066-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..4a5f52e79c77 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1265,7 +1265,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. 
+ * previous cpu our target instead of where it really is. */ p->wake_cpu = cpu; } -- cgit v1.2.3 From 7c3edd2c300b7ef2005a69dc727692ee07434aa5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2016 10:56:25 +0200 Subject: sched/fair: Improve PELT stuff some more Vincent noted that the update_tg_load_avg() usage in commit: 3d30544f0212 ("sched/fair: Apply more PELT fixes") isn't entirely sufficient. We need to call this function every time cfs_rq->avg.load changes, this includes when update_cfs_rq_load_avg() returns true, but {attach,detach}_entity_load_avg() themselves also change it. This means we need to unconditionally call update_tg_load_avg(). Also, add more comments. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3005364fb03..9f9a4e5bbfa9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -726,7 +726,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -759,10 +758,9 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } #else /* !CONFIG_SMP */ @@ -2803,9 +2801,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2931,10 +2941,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. 
*/ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -8442,7 +8452,6 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (!vruntime_normalized(p)) { /* @@ -8454,10 +8463,9 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8465,7 +8473,6 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8476,10 +8483,9 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; -- cgit v1.2.3 From 772bd008cd9a1d4e8ce566f2edcc61d1c28fcbe5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:13 +0100 Subject: sched/fair: Make the use of prev_cpu consistent in the wakeup path In commit: ac66f5477239 ("sched/numa: Introduce migrate_swap()") select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu in special cases (NUMA task swapping). However, the select_task_rq_fair() helper functions: wake_affine() and select_idle_sibling(), still use task_cpu(p) directly to work out prev_cpu, which leads to inconsistencies. This patch passes prev_cpu (potentially overridden by NUMA code) into the helper functions to ensure prev_cpu is indeed the same CPU everywhere in the wakeup path. cc: Ingo Molnar cc: Rik van Riel Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9f9a4e5bbfa9..d819da68857f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -656,7 +656,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -1512,7 +1512,8 @@ balance: * Call select_idle_sibling to maybe find a better one. 
*/ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: task_numa_assign(env, cur, imp); @@ -5101,18 +5102,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5277,11 +5278,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); if (idle_cpu(target)) return target; @@ -5289,8 +5289,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; /* * Otherwise, iterate the domains and find an eligible idle cpu. @@ -5311,6 +5311,8 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; + if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; @@ -5419,13 +5421,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; -- cgit v1.2.3 From eaecf41f5abf80b63c8e025fcb9ee4aa203c3038 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:14 +0100 Subject: sched/fair: Optimize find_idlest_cpu() when there is no choice In the current find_idlest_group()/find_idlest_cpu() search we end up calling find_idlest_cpu() in a sched_group containing only one CPU in the end. Checking idle-states becomes pointless when there is no alternative, so bail out instead. 
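(A stand-alone toy version of the resulting shortcut, with GCC-style builtins standing in for the scheduler's cpumask helpers; all names are illustrative, not the kernel API:)

	#include <stdio.h>

	/* bit i of group_mask set => CPU i belongs to the group */
	static int find_idlest_cpu(unsigned long group_mask)
	{
		/* the added check: with a single CPU there is no choice to make */
		if (__builtin_popcountl(group_mask) == 1)
			return __builtin_ctzl(group_mask);

		/* ... otherwise the expensive part: compare the idle-state
		 * exit latency and load of every allowed CPU in the group ... */
		return __builtin_ctzl(group_mask);	/* placeholder for the scan */
	}

	int main(void)
	{
		printf("group {cpu3} -> cpu %d, no scan needed\n",
		       find_idlest_cpu(1UL << 3));
		return 0;
	}
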
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d819da68857f..acdc351d2386 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5239,6 +5239,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { -- cgit v1.2.3 From 9279e0d2e565e0217618c2087de83d3239811329 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Sun, 10 Jul 2016 15:00:26 +0100 Subject: sched/core: Add documentation for 'cookie' argument Add documentation for the cookie argument in try_to_wake_up_local(). This caused the following warning when building documentation: kernel/sched/core.c:2088: warning: No description found for parameter 'cookie' Signed-off-by: Luis de Bethencourt Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Fixes: e7904a28f533 ("locking/lockdep, sched/core: Implement a better lock pinning scheme") Link: http://lkml.kernel.org/r/1468159226-17674-1-git-send-email-luisbg@osg.samsung.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a5f52e79c77..10f2595c408a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2073,6 +2073,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened + * @cookie: context's cookie for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not -- cgit v1.2.3 From 98b0a857805080db04f50b8c71438c9c369ef0b3 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 5 Aug 2016 16:07:55 +0100 Subject: sched/deadline: Remove useless parameter from setup_new_dl_entity() setup_new_dl_entity() takes two parameters, but it only actually uses one of them, under a different name, to set up a new dl_entity, after: 2f9f3fdc928 "sched/deadline: Remove dl_new from struct sched_dl_entity" as we currently do: setup_new_dl_entity(&p->dl, &p->dl) However, before Luca's change we were doing: setup_new_dl_entity(dl_se, pi_se) in update_dl_entity() for a dl_se->new entity: we were using pi_se's parameters (the potential PI donor) for setting up a new entity. This change removes the useless second parameter of setup_new_dl_entity(). While we are at it we also optimize things further calling setup_new_dl_entity() only for already queued tasks, since (as pointed out by Xunlei) we already do the very same update at task wakeup time anyway. By doing so, we don't need to worry about a potential PI donor anymore, as rt_mutex_setprio() takes care of that already for us. 
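(A stand-alone sketch of the replenishment rule that remains after this change; toy_dl_entity is an illustrative stand-in for struct sched_dl_entity, the clock values are arbitrary, and the real code reads rq_clock(rq):)

	#include <stdio.h>

	struct toy_dl_entity {
		unsigned long long deadline;	/* absolute deadline */
		unsigned long long runtime;	/* remaining budget */
		unsigned long long dl_deadline;	/* relative deadline parameter */
		unsigned long long dl_runtime;	/* per-period budget parameter */
	};

	/* replenish from the entity's *own* parameters; a boosted task was
	 * already replenished by the PI code before this point is reached */
	static void setup_new_dl_entity(struct toy_dl_entity *dl_se,
					unsigned long long now)
	{
		dl_se->deadline = now + dl_se->dl_deadline;
		dl_se->runtime = dl_se->dl_runtime;
	}

	int main(void)
	{
		struct toy_dl_entity d = { .deadline = 90,
					   .dl_deadline = 100, .dl_runtime = 30 };
		unsigned long long rq_clock = 105;

		if (d.deadline < rq_clock)	/* dl_time_before(deadline, now) */
			setup_new_dl_entity(&d, rq_clock);
		printf("deadline=%llu runtime=%llu\n", d.deadline, d.runtime); /* 205 30 */
		return 0;
	}
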
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Xunlei Pang Link: http://lkml.kernel.org/r/1470409675-20935-1-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..d091f4a95416 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -346,12 +346,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * one, and to (try to!) reconcile itself with its own scheduling * parameters. */ -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + WARN_ON(dl_se->dl_boosted); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -367,8 +367,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; } /* @@ -1723,10 +1723,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + + /* If p is not queued we will update its parameters at next wakeup. */ + if (!task_on_rq_queued(p)) + return; + + /* + * If p is boosted we already updated its params in + * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), + * p's deadline being now already after rq_clock(rq). + */ if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl, &p->dl); + setup_new_dl_entity(&p->dl); - if (task_on_rq_queued(p) && rq->curr != p) { + if (rq->curr != p) { #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); -- cgit v1.2.3 From 64a5e3cb308028dba0676daae0a7821d770036fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2016 14:26:11 +0200 Subject: locking/qspinlock: Improve readability Restructure pv_queued_spin_steal_lock() as I found it hard to read. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long --- kernel/locking/qspinlock_paravirt.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8a99abf58080..429c3dc2a5f3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -70,11 +70,14 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; - int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; + if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + qstat_inc(qstat_pv_lock_stealing, true); + return true; + } + + return false; } /* @@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) static inline bool pv_wait_early(struct pv_node *prev, int loop) { - if ((loop & PV_PREV_CHECK_MASK) != 0) return false; -- cgit v1.2.3 From 08be8f63c40c030b5cf95b4368e314e563a86301 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 31 May 2016 12:53:47 -0400 Subject: locking/pvstat: Separate wait_again and spurious wakeup stats Currently there is overlap between the pvqspinlock wait_again and spurious_wakeup stat counters. Because of lock stealing, it is no longer possible to accurately determine if spurious wakeup has happened in the queue head. As they track both the queue node and queue head status, it is also hard to tell how many of those come from the queue head and how many from the queue node. This patch changes the accounting rules so that spurious wakeup is only tracked in the queue node. The wait_again count, however, is only tracked in the queue head when the vCPU failed to acquire the lock after a vCPU kick. This should give a much better indication of the wait-kick dynamics in the queue node and the queue head. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Douglas Hatch Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464713631-1066-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 12 +++--------- kernel/locking/qspinlock_stat.h | 4 ++-- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 429c3dc2a5f3..3acf16d79cf4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -288,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - int waitcnt = 0; int loop; bool wait_early; - /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ - for (;; waitcnt++) { + for (;;) { for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; @@ -317,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) if (!READ_ONCE(node->locked)) { qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_again, waitcnt); qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -458,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) pv_wait(&l->locked, _Q_SLOW_VAL); /* - * The unlocker should have freed the lock before kicking the - * CPU. So if the lock is still not free, it is a spurious - * wakeup or another vCPU has stolen the lock. The current - * vCPU should spin again. + * Because of lock stealing, the queue head vCPU may not be + * able to acquire the lock before it has to wait again. */ - qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -24,8 +24,8 @@ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups - * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs + * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node -- cgit v1.2.3 From 80127a39681bd68c959f0953f84a830cbd7c3b1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2016 20:08:46 +0200 Subject: locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting; which could be quite a while and slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org [ Fixed modular build. ] Signed-off-by: Ingo Molnar --- kernel/locking/percpu-rwsem.c | 228 ++++++++++++++++++++++++------------------ kernel/rcu/sync.c | 2 + 2 files changed, 133 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,152 +8,186 @@ #include #include -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { - bool success; + /* + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. + * + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. + */ - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); + smp_mb(); /* A matches D */ - return success; -} + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). 
+ */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); - if (likely(update_fast_ctr(brw, +1))) - return; + if (try) + return 0; - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + /* + * Avoid lockdep for the down/up_read() we already have them. + */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); return 1; } +EXPORT_SYMBOL_GPL(__percpu_down_read); -void percpu_up_read(struct percpu_rw_semaphore *brw) +void __percpu_up_read(struct percpu_rw_semaphore *sem) { - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); } -EXPORT_SYMBOL_GPL(percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) { - unsigned int sum = 0; - int cpu; + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } + smp_mb(); /* C matches B */ - return sum; + return true; } -void percpu_down_write(struct percpu_rw_semaphore *brw) +void percpu_down_write(struct percpu_rw_semaphore *sem) { + /* Notify readers to take the slow path. 
*/ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. - * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. */ - rcu_sync_enter(&brw->rss); + WRITE_ONCE(sem->readers_block, 1); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* D matches A */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. + */ + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. 
*/ - rcu_sync_exit(&brw->rss); + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..198473d90f81 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } + +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** -- cgit v1.2.3 From aa877175e7a9982233ed8f10cb4bfddd78d82741 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 3 Aug 2016 13:22:28 -0400 Subject: cpu/hotplug: Prevent alloc/free of irq descriptors during CPU up/down (again) Now that Xen no longer allocates irqs in _cpu_up() we can restore commit: a89941816726 ("hotplug: Prevent alloc/free of irq descriptors during cpu up/down") Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross Acked-by: Thomas Gleixner Cc: Anna-Maria Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: david.vrabel@citrix.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1470244948-17674-3-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..ec12b726fa6f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -349,8 +349,16 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + irq_unlock_sparse(); if (ret) { cpu_notify(CPU_UP_CANCELED, cpu); return ret; -- cgit v1.2.3 From d1c6d149cf04d6c7c3c3ebf4b66c82500cbcf6e1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 23 Jul 2016 09:46:39 +0200 Subject: sched/debug: Make the "Preemption disabled at ..." message more useful This message is currently really useless since it always prints a value that comes from the printk() we just did, e.g.: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1 Preemption disabled at:[] down_trylock+0x13/0x80 BUG: sleeping function called from invalid context at include/linux/freezer.h:56 in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1 Preemption disabled at:[] console_unlock+0x2f7/0x930 Here, both down_trylock() and console_unlock() is somewhere in the printk() path. We should save the value before calling printk() and use the saved value instead. That immediately reveals the offending callsite: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 14971, name: trinity-c2 Preemption disabled at:[] rhashtable_walk_start+0x46/0x150 Bug report: http://marc.info/?l=linux-netdev&m=146925979821849&w=2 Signed-off-by: Vegard Nossum Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rusty Russel Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 10f2595c408a..a65681605aef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3171,6 +3171,9 @@ static inline void preempt_latency_stop(int val) { } */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3181,13 +3184,12 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif if (panic_on_warn) panic("scheduling while atomic\n"); @@ -7571,6 +7573,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7581,6 +7584,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -7595,13 +7601,12 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif dump_stack(); } EXPORT_SYMBOL(___might_sleep); -- cgit v1.2.3 From f0b22e39e3409109d40ef036b1f46b419e82f58e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 22 Jul 2016 21:46:02 +0200 Subject: sched/debug: Add taint on "BUG: Sleeping function called from invalid context" Seeing this, it occurs to me that we should probably add a taint here: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 32211, name: trinity-c3 Preemption disabled at:[] console_unlock+0x2f7/0x930 CPU: 3 PID: 32211 Comm: trinity-c3 Not tainted 4.7.0-rc7+ #19 ^^^^^^^^^^^ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 0000000000000000 ffff8800b8a17160 ffffffff81971441 ffff88011a3c4c80 ffff88011a3c4c80 ffff8800b8a17198 ffffffff81158067 0000000000000de6 ffff88011a3c4c80 ffffffff8390e07c 0000000000000184 0000000000000000 Call Trace: [...] 
BUG: sleeping function called from invalid context at arch/x86/mm/fault.c:1309 in_atomic(): 0, irqs_disabled(): 0, pid: 32211, name: trinity-c3 Preemption disabled at:[] down_trylock+0x13/0x80 CPU: 3 PID: 32211 Comm: trinity-c3 Not tainted 4.7.0-rc7+ #19 ^^^^^^^^^^^ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 0000000000000000 ffff8800b8a17e08 ffffffff81971441 ffff88011a3c4c80 ffff88011a3c4c80 ffff8800b8a17e40 ffffffff81158067 0000000000000000 ffff88011a3c4c80 ffffffff83437b20 000000000000051d 0000000000000000 Call Trace: [...] Signed-off-by: Vegard Nossum Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rusty Russel Link: http://lkml.kernel.org/r/1469216762-19626-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a65681605aef..3b6b23c57418 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7608,6 +7608,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) pr_cont("\n"); } dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); #endif -- cgit v1.2.3 From 4c737b41de7f4eef2a593803bad1b918dd718b10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: make cgroup_path() and friends behave in the style of strlcpy() cgroup_path() and friends used to format the path from the end and thus the resulting path usually didn't start at the start of the passed in buffer. Also, when the buffer was too small, the partial result was truncated from the head rather than tail and there was no way to tell how long the full path would be. These make the functions less robust and more awkward to use. With recent updates to kernfs_path(), cgroup_path() and friends can be made to behave in strlcpy() style. * cgroup_path(), cgroup_path_ns[_locked]() and task_cgroup_path() now always return the length of the full path. If buffer is too small, it contains nul terminated truncated output. * All users updated accordingly. v2: cgroup_path() usage in kernel/sched/debug.c converted. 
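(The new contract mirrors snprintf()/strlcpy(): the return value is the length of the full path, so callers compare it against the buffer size instead of testing a returned pointer. A hypothetical caller under the new rules; the allocation and labels here are illustrative, not from the patch:)

	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
	int len;

	len = cgroup_path(cgrp, buf, PATH_MAX);
	if (len >= PATH_MAX) {
		/* full path didn't fit; buf holds a nul-terminated truncated copy */
		ret = -ENAMETOOLONG;
		goto out_free;
	}
	/* on success buf starts at buf[0] and holds the complete path */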
Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Serge Hallyn Cc: Peter Zijlstra --- kernel/cgroup.c | 48 ++++++++++++++++++++++-------------------------- kernel/cpuset.c | 12 ++++++------ kernel/sched/debug.c | 3 ++- 3 files changed, 30 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..3861161e460f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2315,22 +2315,18 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); - int ret; - ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } -char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { - char *ret; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2357,12 +2353,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * * Return value is the same as kernfs_path(). */ -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; - char *path = NULL; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2371,16 +2367,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - if (strlcpy(buf, "/", buflen) < buflen) - path = buf; + ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - return path; + return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -5716,7 +5711,7 @@ core_initcall(cgroup_wq_init); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *path; + char *buf; int retval; struct cgroup_root *root; @@ -5759,18 +5754,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. 
*/ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (!path) { + if (retval >= PATH_MAX) { retval = -ENAMETOOLONG; goto out_unlock; } + + seq_puts(m, buf); } else { - path = "/"; + seq_puts(m, "/"); } - seq_puts(m, path); - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else @@ -6035,8 +6030,9 @@ static void cgroup_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL, *path; + char *pathbuf = NULL, *agentbuf = NULL; char *argv[3], *envp[3]; + int ret; mutex_lock(&cgroup_mutex); @@ -6046,13 +6042,13 @@ static void cgroup_release_agent(struct work_struct *work) goto out; spin_lock_irq(&css_set_lock); - path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (!path) + if (ret >= PATH_MAX) goto out; argv[0] = agentbuf; - argv[1] = path; + argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..793ae6fd96dc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2689,7 +2689,7 @@ void __cpuset_memory_pressure_bump(void) int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *p; + char *buf; struct cgroup_subsys_state *css; int retval; @@ -2700,18 +2700,18 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); - p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); css_put(css); - if (!p) + if (retval >= PATH_MAX) goto out_free; - seq_puts(m, p); + seq_puts(m, buf); seq_putc(m, '\n'); retval = 0; out_free: kfree(buf); out: - return retval; + return 0; } #endif /* CONFIG_PROC_PID_CPUSET */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..23cb609ba4eb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -410,7 +410,8 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif -- cgit v1.2.3 From ed1777de25e45bfb58fad63341904f8a77911785 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: add tracepoints for basic operations Debugging what goes wrong with cgroup setup can get hairy. Add tracepoints for cgroup hierarchy mount, cgroup creation/destruction and task migration operations for better visibility. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3861161e460f..5e2e81ad9175 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -64,6 +64,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * pidlists linger the following amount before being destroyed. 
The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + trace_cgroup_destroy_root(root); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); spin_unlock(&release_agent_path_lock); } + + trace_cgroup_remount(root); + out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; + trace_cgroup_setup_root(root); + /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in @@ -2825,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + return ret; } @@ -3587,6 +3601,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); mutex_unlock(&cgroup_mutex); @@ -4355,6 +4371,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); @@ -5020,6 +5038,8 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); } else { /* cgroup release path */ + trace_cgroup_release(cgrp); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5306,6 +5326,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; + trace_cgroup_mkdir(cgrp); + /* let's create and online css's */ kernfs_activate(kn); @@ -5481,6 +5503,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) ret = cgroup_destroy_locked(cgrp); + if (!ret) + trace_cgroup_rmdir(cgrp); + cgroup_kn_unlock(kn); return ret; } -- cgit v1.2.3 From 60d20f9195b260bdf0ac10c275ae9f6016f9c069 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Fri, 12 Aug 2016 08:56:52 -0700 Subject: bpf: Add bpf_current_task_under_cgroup helper This adds a bpf helper that's similar to the skb_in_cgroup helper to check whether the probe is currently executing in the context of a specific subset of the cgroupsv2 hierarchy. It does this based on membership test for a cgroup arraymap. It is invalid to call this in an interrupt, and it'll return an error. The helper is primarily to be used in debugging activities for containers, where you may have multiple programs running in a given top-level "container". Signed-off-by: Sargun Dhillon Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Tejun Heo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/verifier.c | 4 +++- kernel/trace/bpf_trace.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633a650d7aeb..a2ac051c342f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void) } late_initcall(register_perf_event_array_map); -#ifdef CONFIG_SOCK_CGROUP_DATA +#ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7094c69ac199..d504722ebfa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - if (func_id != BPF_FUNC_skb_in_cgroup) + if (func_id != BPF_FUNC_skb_in_cgroup && + func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; default: @@ -1075,6 +1076,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_current_task_under_cgroup: case BPF_FUNC_skb_in_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..6b794d6669a7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -376,6 +376,34 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *)(long)r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + u32 idx = (u32)r2; + + if (unlikely(in_interrupt())) + return -EINVAL; + + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -407,6 +435,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: return bpf_get_probe_write_proto(); + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; default: return NULL; } -- cgit v1.2.3 From 6841de8b0d03cc9a4e0e928453623c13ee754f77 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 11 Aug 2016 18:17:16 -0700 Subject: bpf: allow helpers access the packet directly The helper functions like bpf_map_lookup_elem(map, key) were only allowing 'key' to point to the initialized stack area. That is causing performance degradation when programs need to process millions of packets per second and need to copy contents of the packet into the stack just to pass the stack pointer into the lookup() function. Allow such helpers read from the packet directly. 
All helpers that expect ARG_PTR_TO_MAP_KEY, ARG_PTR_TO_MAP_VALUE, ARG_PTR_TO_STACK assume byte aligned pointer, so no alignment concerns, only need to check that helper will not be accessing beyond the packet range verified by the prior 'if (ptr < data_end)' condition. For now allow this feature for XDP programs only. Later it can be relaxed for the clsact programs as well. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 61 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d504722ebfa4..0cc46f1df358 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -930,14 +930,14 @@ static int check_func_arg(struct verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct reg_state *reg = env->cur_state.regs + regno; - enum bpf_reg_type expected_type; + struct reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + enum bpf_reg_type expected_type, type = reg->type; int err = 0; if (arg_type == ARG_DONTCARE) return 0; - if (reg->type == NOT_INIT) { + if (type == NOT_INIT) { verbose("R%d !read_ok\n", regno); return -EACCES; } @@ -950,16 +950,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } + if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) { + verbose("helper access to the packet is not allowed for clsact\n"); + return -EACCES; + } + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; + if (type != PTR_TO_PACKET && type != expected_type) + goto err_type; } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_RAW_STACK) { expected_type = PTR_TO_STACK; @@ -967,20 +980,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno, * passed in as argument, it's a CONST_IMM type. Final test * happens during stack boundary checking. 
*/ - if (reg->type == CONST_IMM && reg->imm == 0) - expected_type = CONST_IMM; + if (type == CONST_IMM && reg->imm == 0) + /* final test in check_stack_boundary() */; + else if (type != PTR_TO_PACKET && type != expected_type) + goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; } - if (reg->type != expected_type) { - verbose("R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], reg_type_str[expected_type]); - return -EACCES; - } - if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ meta->map_ptr = reg->map_ptr; @@ -998,8 +1007,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, meta->map_ptr->key_size, - false, NULL); + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, 0, + meta->map_ptr->key_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->key_size, + false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -1009,9 +1023,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, - meta->map_ptr->value_size, - false, NULL); + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, 0, + meta->map_ptr->value_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, + false, NULL); } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); @@ -1025,11 +1043,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + if (regs[regno - 1].type == PTR_TO_PACKET) + err = check_packet_access(env, regno - 1, 0, reg->imm); + else + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed, meta); } return err; +err_type: + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[type], reg_type_str[expected_type]); + return -EACCES; } static int check_map_func_compatibility(struct bpf_map *map, int func_id) -- cgit v1.2.3 From 8937bd80fce64a25be23c7790459d93f7b1e9b79 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 11 Aug 2016 18:17:18 -0700 Subject: bpf: allow bpf_get_prandom_u32() to be used in tracing bpf_get_prandom_u32() was initially introduced for socket filters and later requested numerous times to be added to tracing bpf programs for the same reason as in socket filters: to be able to randomly select incoming events. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6b794d6669a7..ad35213b8405 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -437,6 +437,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return bpf_get_probe_write_proto(); case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; default: return NULL; } -- cgit v1.2.3 From bdfaa2eecd5f6ca0cb5cff2bc7a974a15a2fd21b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 17 Aug 2016 17:37:04 +0200 Subject: uprobes: Rename the "struct page *" args of __replace_page() Purely cosmetic, no changes in the compiled code. Perhaps it is just me but I can hardly read __replace_page() because I can't distinguish "page" from "kpage" and because I need to look at the caller to ensure that, say, kpage is really the new page and the code is correct. Rename them to old_page and new_page; this matches the caller. Signed-off-by: Oleg Nesterov Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Brenden Blanco Cc: Jiri Olsa Cc: Johannes Weiner Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20160817153704.GC29724@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c50276b60d1..d4129bb05e5d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * Returns 0 on success, -EFAULT on failure.
*/ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, struct page *kpage) + struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -161,49 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); if (err) return err; /* For try_to_free_swap() and munlock_vma_page() below */ - lock_page(page); + lock_page(old_page); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; - ptep = page_check_address(page, mm, addr, &ptl, 0); + ptep = page_check_address(old_page, mm, addr, &ptl, 0); if (!ptep) { - mem_cgroup_cancel_charge(kpage, memcg, false); + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } - get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr, false); - mem_cgroup_commit_charge(kpage, memcg, false, false); - lru_cache_add_active_or_unevictable(kpage, vma); + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); - if (!PageAnon(page)) { - dec_mm_counter(mm, mm_counter_file(page)); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(page, false); - if (!page_mapped(page)) - try_to_free_swap(page); + page_remove_rmap(old_page, false); + if (!page_mapped(old_page)) + try_to_free_swap(old_page); pte_unmap_unlock(ptep, ptl); if (vma->vm_flags & VM_LOCKED) - munlock_vma_page(page); - put_page(page); + munlock_vma_page(old_page); + put_page(old_page); err = 0; unlock: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - unlock_page(page); + unlock_page(old_page); return err; } -- cgit v1.2.3 From 29dd3288705f26cc27663e79061209dabce2d5b9 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Wed, 17 Aug 2016 15:06:08 +0530 Subject: bitmap.h, perf/core: Fix the mask in perf_output_sample_regs() When decoding the perf_regs mask in perf_output_sample_regs(), we loop through the mask using find_first_bit and find_next_bit functions. While the existing code works fine in most cases, the logic is broken for big-endian 32-bit kernels. When reading a u64 mask using (u32 *)(&val)[0], find_*_bit() assumes that it gets the lower 32 bits of u64, but instead it gets the upper 32 bits - which is wrong. The fix is to swap the words of the u64 to handle this case. This is _not_ a regular endianness swap.
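(A userspace model of the problem and of bitmap_from_u64() for the 32-bit case, where a u64 mask occupies two words; the function body below is a sketch matching the semantics described above, not the kernel implementation verbatim:)

#include <stdio.h>
#include <stdint.h>

/* Casting (unsigned long *)&mask exposes the storage order, which on a
 * big-endian 32-bit kernel puts the upper 32 bits first, so find_*_bit()
 * reads the wrong word. Assigning by value instead guarantees that bits
 * 0..31 always land in word 0: */
static void bitmap_from_u64(uint32_t *dst, uint64_t mask)
{
	dst[0] = mask & 0xffffffffUL;	/* bits  0..31 */
	dst[1] = mask >> 32;		/* bits 32..63 */
}

int main(void)
{
	uint32_t bm[2];

	bitmap_from_u64(bm, 1ULL << 40);
	printf("word0=%#x word1=%#x\n", bm[0], bm[1]);	/* bit 40 -> bit 8 of word 1 */
	return 0;
}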
Suggested-by: Yury Norov Signed-off-by: Madhavan Srinivasan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yury Norov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1471426568-31051-2-git-send-email-maddy@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ca4fde5ed268..849919c2f3d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5340,9 +5340,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; + DECLARE_BITMAP(_mask, 64); - for_each_set_bit(bit, (const unsigned long *) &mask, - sizeof(mask) * BITS_PER_BYTE) { + bitmap_from_u64(_mask, mask); + for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); -- cgit v1.2.3 From 4ff6a8debf48a7bf48e93c01da720785070d3a25 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 17 Aug 2016 13:55:05 -0700 Subject: perf/core: Generalize event->group_flags Currently, PERF_GROUP_SOFTWARE is used in the group_flags field of a group's leader to indicate that is_software_event(event) is true for all events in a group. This is the only usage of event->group_flags. This pattern of setting a group-level flag when all events in the group share a property is useful for the flag introduced in the next patch and for future CQM/CMT flags. So this patch generalizes group_flags to work as an aggregate of event-level flags. PERF_GROUP_SOFTWARE denotes an immutable event property. All other flags that I intend to add are also determinable at event initialization. To better convey the above, this patch renames event's group_flags to group_caps and PERF_GROUP_SOFTWARE to PERF_EV_CAP_SOFTWARE. Individual event flags are stored in the new event->event_caps. Since the cap flags do not change after event initialization, there is no need to serialize event_caps. This new field is used when events are added to a context, similarly to how PERF_GROUP_SOFTWARE and is_software_event() worked. Lastly, for consistency, this patch updates is_software_event() to rely on event_caps instead of the context index.
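(The aggregation rule is a running AND: a capability stays in group_caps only while every member has it. A minimal model follows; the flag value is this sketch's assumption, not the kernel's definition:)

#include <stdio.h>

#define PERF_EV_CAP_SOFTWARE	0x1	/* illustrative value only */

struct event { unsigned int event_caps, group_caps; };

/* On attach, the leader keeps only capabilities every sibling also has. */
static void group_attach(struct event *leader, const struct event *sibling)
{
	leader->group_caps &= sibling->event_caps;
}

int main(void)
{
	struct event leader = { PERF_EV_CAP_SOFTWARE, PERF_EV_CAP_SOFTWARE };
	struct event hw_sibling = { 0, 0 };	/* not a software event */

	group_attach(&leader, &hw_sibling);
	printf("pure software group: %s\n",
	       (leader.group_caps & PERF_EV_CAP_SOFTWARE) ? "yes" : "no");
	return 0;
}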
Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1471467307-61171-3-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 849919c2f3d7..8c42a5ae9030 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1475,8 +1475,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) { struct list_head *list; - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; + event->group_caps = event->event_caps; list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); @@ -1630,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event) WARN_ON_ONCE(group_leader->ctx != event->ctx); - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + group_leader->group_caps &= event->event_caps; list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; @@ -1723,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event) sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; + sibling->group_caps = event->group_caps; WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -2149,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (event->group_flags & PERF_GROUP_SOFTWARE) + if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware @@ -9490,6 +9487,9 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -9503,7 +9503,7 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = group_leader->pmu; } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to -- cgit v1.2.3 From d6a2f9035bfc27d0e9d78b13635dda9fb017ac01 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 17 Aug 2016 13:55:06 -0700 Subject: perf/core: Introduce PMU_EV_CAP_READ_ACTIVE_PKG Introduce the flag PMU_EV_CAP_READ_ACTIVE_PKG, useful for uncore events, that allows a PMU to signal the generic perf code that an event is readable in the current CPU if the event is active in a CPU in the same package as the current CPU. This is an optimization that avoids an unnecessary IPI for the common case where uncore events are run and read in the same package but in different CPUs. As an example, the IPI removal speeds up perf_read() in my Haswell system as follows: - For event UNC_C_LLC_LOOKUP: From 260 us to 31 us. - For event RAPL's power/energy-cores/: From 255 us to 27 us.
For the optimization to work, all events in the group must have it (similarly to PERF_EV_CAP_SOFTWARE). Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1471467307-61171-4-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8c42a5ae9030..3f07e6cfc1b6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3424,6 +3424,22 @@ struct perf_read_data { int ret; }; +static int find_cpu_to_read(struct perf_event *event, int local_cpu) +{ + int event_cpu = event->oncpu; + u16 local_pkg, event_pkg; + + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); + + if (event_pkg == local_pkg) + return local_cpu; + } + + return event_cpu; +} + /* * Cross CPU call to read the hardware event */ @@ -3545,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int ret = 0, cpu_to_read, local_cpu; /* * If event is enabled and currently active on a CPU, update the @@ -3557,7 +3573,12 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + + local_cpu = get_cpu(); + cpu_to_read = find_cpu_to_read(event, local_cpu); + put_cpu(); + + ret = smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); /* The event must have been read from an online CPU: */ WARN_ON_ONCE(ret); ret = ret ? : data.ret; -- cgit v1.2.3 From 1fc770d5899c995db8e22d35eb918a2cb79559d9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 15 Aug 2016 12:14:10 -0400 Subject: sched: Remove struct rq::nohz_stamp The nohz_stamp member of struct rq has been unused since 2010, when this commit removed the code that referenced it: 396e894d289d ("sched: Revert nohz_ratelimit() for now") Signed-off-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160815121410.5ea1c98f@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..afe76d04e916 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -597,7 +597,6 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - u64 nohz_stamp; unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL -- cgit v1.2.3 From 94f438c84e850570f28dd36588a0d7f73b991e44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Aug 2016 12:54:59 +0200 Subject: sched/core: Clarify SD_flags comment The SD_flags comment is very terse and doesn't explain why PACKING is odd. IIRC the distinction is that the 'normal' ones only describe topology, while the ASYM_PACKING one also prescribes behaviour. It is odd in the way that it doesn't only describe things. 
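(For context on how these flags are consumed: topology levels hand their descriptive flags to sd_init() through per-level flag callbacks in the topology table, approximately as in the kernel of this era, with SMT shown as the example of a purely descriptive flag set:)

static inline int cpu_smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}

static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};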
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/20160815105459.GS6879@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b6b23c57418..54fff8109922 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6355,13 +6355,19 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ -- cgit v1.2.3 From 0e6d2a67a41321b3ef650b780a279a37855de08e Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:21 +0100 Subject: sched/core: Remove unnecessary NULL-pointer check Checking if the sched_domain pointer returned by sd_init() is NULL seems pointless as sd_init() neither checks if it is valid to begin with nor sets it to NULL. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 54fff8109922..1b2dd5220170 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6854,8 +6854,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct sched_domain *child, int cpu) { struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { -- cgit v1.2.3 From 1f6e6c7cb9bcd58abb5ee11243e0eefe6b36fc8e Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:22 +0100 Subject: sched/core: Introduce SD_ASYM_CPUCAPACITY sched_domain topology flag Add a topology flag to the sched_domain hierarchy indicating the lowest domain level where the full range of CPU capacities is represented by the domain members for asymmetric capacity topologies (e.g. ARM big.LITTLE).
The flag is intended to indicate that extra care should be taken when placing tasks on CPUs and this level spans all the different types of CPUs found in the system (no need to look further up the domain hierarchy). This information is currently only available by iterating over the capacities of all the CPUs at parent levels in the sched_domain hierarchy.

SD 2      [  0  1  2  3]   SD_ASYM_CPUCAPACITY

SD 1      [  0  1] [ 2  3]  !SD_ASYM_CPUCAPACITY

CPU:         0    1    2    3
capacity:  756  756 1024 1024

If the topology in the example above is duplicated to create an eight-CPU example with a third sched_domain level on top (SD 3), this level should not have the flag set (!SD_ASYM_CPUCAPACITY) as its two groups would both have all CPU capacities represented within them. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b2dd5220170..46bfb90aec00 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5716,6 +5716,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5746,6 +5747,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -6363,6 +6365,7 @@ static int sched_domains_curr_level; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -6374,6 +6377,7 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * -- cgit v1.2.3 From 3676b13e8524c576825fe1e731e347dba0083888 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:23 +0100 Subject: sched/core: Pass child domain into sd_init() If behavioural sched_domain flags depend on topology flags set at higher domain levels we need a way to update the child domain flags. Moving the child pointer assignment inside sd_init() should make that possible.
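[Illustrative aside, not part of the patch: a toy userspace model of why sd_init() wants the child pointer. Once the child is known at init time, a level that carries a topology flag can push derived behaviour flags down the chain, which is what the follow-up SD_BALANCE_WAKE patch below does with for_each_lower_domain(). The structure and helper names here are simplified stand-ins, not the kernel's.]

#include <stdio.h>

#define SD_ASYM_CPUCAPACITY	(1 << 0)
#define SD_BALANCE_WAKE		(1 << 1)

struct dom {
	int flags;
	struct dom *child;
};

static void dom_init(struct dom *d, int topo_flags, struct dom *child)
{
	d->flags = topo_flags;
	d->child = child;

	/* Behaviour derived from topology: push wake balancing down the
	 * chain, starting at d itself, as for_each_lower_domain() does. */
	if (topo_flags & SD_ASYM_CPUCAPACITY) {
		struct dom *t;

		for (t = d; t; t = t->child)
			t->flags |= SD_BALANCE_WAKE;
	}
}

int main(void)
{
	struct dom mc = { 0, NULL };	/* lower level, init'd earlier */
	struct dom die;

	dom_init(&die, SD_ASYM_CPUCAPACITY, &mc);
	printf("mc balances on wake-up: %d\n", !!(mc.flags & SD_BALANCE_WAKE));
	return 0;
}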
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46bfb90aec00..57394650c6ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6381,7 +6381,8 @@ static int sched_domains_curr_level; SD_SHARE_POWERDOMAIN) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + struct sched_domain *child, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); int sd_weight, sd_flags = 0; @@ -6433,6 +6434,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif @@ -6857,14 +6859,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); + struct sched_domain *sd = sd_init(tl, child, cpu); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { -- cgit v1.2.3 From 9ee1cda5ee25c7dd82acf25892e0d229e818f8c7 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:24 +0100 Subject: sched/core: Enable SD_BALANCE_WAKE for asymmetric capacity systems A domain with the SD_ASYM_CPUCAPACITY flag set indicates that sched_groups at this level and below do not include CPUs of all capacities available (e.g. group containing little-only or big-only CPUs in big.LITTLE systems). It is therefore necessary to put more effort into finding an appropriate CPU at task wake-up by enabling balancing at wake-up (SD_BALANCE_WAKE) on all lower (child) levels. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 57394650c6ab..4695df6ed752 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6444,6 +6444,13 @@ sd_init(struct sched_domain_topology_level *tl, * Convert topological properties into behaviour.
*/ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; -- cgit v1.2.3 From cd92bfd3b8cb0ec2ee825e55a3aee704cd55aea9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 1 Aug 2016 19:53:35 +0100 Subject: sched/core: Store maximum per-CPU capacity in root domain To be able to compare the capacity of the target CPU with the highest available CPU capacity, store the maximum per-CPU capacity in the root domain. The max per-CPU capacity should be 1024 for all systems except SMT, where the capacity is currently based on smt_gain and the number of hardware threads and is <1024. If SMT can be brought to work with a per-thread capacity of 1024, this patch can be dropped and replaced by a hard-coded max capacity of 1024 (=SCHED_CAPACITY_SCALE). Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/26c69258-9947-f830-a53e-0c54e7750646@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++++++++ kernel/sched/sched.h | 2 ++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4695df6ed752..69243142cad1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6903,6 +6903,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -6953,11 +6954,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ + if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); + if (rq) { + pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); + } + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index afe76d04e916..420c05d099c3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -565,6 +565,8 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; -- cgit v1.2.3 From 3273163c6775c4c21823985304c2364b08ca6ea2 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:26 +0100 Subject: sched/fair: Let asymmetric CPU configurations balance at wake-up Currently, SD_WAKE_AFFINE always takes priority over wakeup balancing if SD_BALANCE_WAKE is set on the sched_domains. For asymmetric configurations SD_WAKE_AFFINE is only desirable if the waking task's compute demand (utilization) is suitable for the waking CPU and the previous CPU, and all CPUs within their respective SD_SHARE_PKG_RESOURCES domains (sd_llc). If not, let wakeup balancing take over (find_idlest_{group, cpu}()). 
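[Illustrative aside with made-up numbers, not part of the patches above: the capacity-fit test described next boils down to the comparison min_cap * 1024 < task_util * capacity_margin that wake_cap() introduces below. A minimal sketch:]

#include <stdio.h>

static const long capacity_margin = 1280;	/* ~20% headroom, as below */

/* Nonzero when a task with utilization @util does not fit @min_cap. */
static int task_fits_poorly(long min_cap, long util)
{
	return min_cap * 1024 < util * capacity_margin;
}

int main(void)
{
	/* A little CPU of capacity 756 (cf. the big.LITTLE example earlier):
	 * a task of util 400 still fits, a task of util 700 does not. */
	printf("util 400: misfit=%d\n", task_fits_poorly(756, 400));	/* 0 */
	printf("util 700: misfit=%d\n", task_fits_poorly(756, 700));	/* 1 */
	return 0;
}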
This patch makes affine wake-ups conditional on whether both the waker CPU and the previous CPU have sufficient capacity for the waking task, or not, assuming that the CPU capacities within an SD_SHARE_PKG_RESOURCES domain (sd_llc) are homogeneous. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index acdc351d2386..61d485421bed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * 1024 < capacity * margin + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -5376,6 +5382,32 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } +static inline int task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5399,7 +5431,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); } rcu_read_lock(); -- cgit v1.2.3 From a1eb1411b4e4251db02179e39d234c2ee5192c72 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 17 Aug 2016 11:30:44 +0200 Subject: sched/cputime: Improve scalability by not accounting thread group tasks pending runtime Commit: d670ec13178d0 ("posix-cpu-timers: Cure SMP wobbles") started accounting thread group tasks pending runtime in thread_group_cputime(). Another commit: 6e998916dfe32 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") updated scheduler runtime statistics (calling update_curr()) when reading task pending runtime. Those changes cause bad performance of the SYS_times() and SYS_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) syscalls, especially on larger systems with many CPUs. While we would like to keep cpuclock monotonicity, i.e.
have the problems fixed by the above commits stay fixed, we would also like to have good performance. However, when we notice that the change from commit d670ec13178d0 is no longer needed to solve the problem addressed by that commit, because of the change from the second commit, 6e998916dfe32, we gain room for optimization. Since we update the task while reading its pending runtime in task_sched_runtime(), clock_gettime(CLOCK_PROCESS_CPUTIME_ID) will see updated values, and in the testcase from d670ec13178d0 the process cpuclock will not be smaller than the thread cpuclock. I tested the patch on testcases from commits d670ec13178d0, 6e998916dfe32 and some other cpuclock/cputimers testcases and did not find cpuclock monotonicity problems or other malfunctions. This patch has the drawback that we will not provide thread group cputime up-to-date to the last moment. For example, when arming a cputime timer, we will arm it with possibly slightly outdated values and that timer will trigger earlier compared to the behaviour without the patch. However, that was the behaviour before commit d670ec13178d0 (kernel v3.1), so it is unlikely to affect applications. The patch improves related syscall performance, as measured by Giovanni's benchmarks described in commit 6075620b0590e ("sched/cputime: Mitigate performance regression in times()/clock_gettime()"). The benchmark results are:

SYS_clock_gettime():

threads  4.7-rc7   3.18-rc3            4.7-rc7 + prefetch  4.7-rc7 + patch
                   (pre-6e998916dfe3)
2        3.48      2.23 ( 35.68%)      3.06 ( 11.83%)      1.08 ( 68.81%)
5        3.33      2.83 ( 14.84%)      3.25 (  2.40%)      0.71 ( 78.55%)
8        3.37      2.84 ( 15.80%)      3.26 (  3.30%)      0.56 ( 83.49%)
12       3.32      3.09 (  6.69%)      3.37 ( -1.60%)      0.42 ( 87.28%)
21       4.01      3.14 ( 21.70%)      3.90 (  2.74%)      0.35 ( 91.35%)
30       3.63      3.28 (  9.75%)      3.36 (  7.41%)      0.28 ( 92.23%)
48       3.71      3.02 ( 18.69%)      3.11 ( 16.27%)      0.39 ( 89.39%)
79       3.75      2.88 ( 23.23%)      3.16 ( 15.74%)      0.46 ( 87.76%)
110      3.81      2.95 ( 22.62%)      3.25 ( 14.80%)      0.56 ( 85.41%)
128      3.88      3.05 ( 21.28%)      3.31 ( 14.76%)      0.62 ( 84.10%)

SYS_times():

threads  4.7-rc7   3.18-rc3            4.7-rc7 + prefetch  4.7-rc7 + patch
                   (pre-6e998916dfe3)
2        3.65      2.27 ( 37.94%)      3.25 ( 11.03%)      1.62 ( 55.71%)
5        3.45      2.78 ( 19.34%)      3.17 (  7.92%)      2.33 ( 32.28%)
8        3.52      2.79 ( 20.66%)      3.22 (  8.69%)      2.06 ( 41.44%)
12       3.29      3.02 (  8.33%)      3.36 ( -2.04%)      2.00 ( 39.18%)
21       4.07      3.10 ( 23.86%)      3.92 (  3.78%)      2.07 ( 49.18%)
30       3.87      3.33 ( 13.80%)      3.40 ( 12.17%)      1.89 ( 51.12%)
48       3.79      2.96 ( 21.94%)      3.16 ( 16.61%)      1.69 ( 55.46%)
79       3.88      2.88 ( 25.82%)      3.28 ( 15.42%)      1.60 ( 58.81%)
110      3.90      2.98 ( 23.73%)      3.38 ( 13.35%)      1.73 ( 55.61%)
128      4.00      3.10 ( 22.40%)      3.38 ( 15.45%)      1.66 ( 58.52%)

Reported-and-tested-by: Giovanni Gherdovich Signed-off-by: Stanislaw Gruszka Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/20160817093043.GA25206@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89eb96..b93c72d5f64f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -306,6 +306,26 @@ static inline cputime_t account_other_time(cputime_t max) return accounted; } +#ifdef CONFIG_64BIT +static inline u64 read_sum_exec_runtime(struct task_struct *t) +{ + return t->se.sum_exec_runtime; +} +#else +static u64 read_sum_exec_runtime(struct task_struct *t) +{ + u64 ns; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(t, &rf);
+ ns = t->se.sum_exec_runtime; + task_rq_unlock(rq, t, &rf); + + return ns; +} +#endif + /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live * tasks (sum on group iteration) belonging to @tsk's group. @@ -318,6 +338,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) unsigned int seq, nextseq; unsigned long flags; + /* + * Update current task runtime to account pending time since last + * scheduler action or thread_group_cputime() call. This thread group + * might have other running tasks on different CPUs, but updating + * their runtime can affect syscall performance, so we skip account + * those pending times and rely only on values updated on tick or + * other scheduler action. + */ + if (same_thread_group(current, tsk)) + (void) task_sched_runtime(current); + rcu_read_lock(); /* Attempt a lockless read on the first round. */ nextseq = 0; @@ -332,7 +363,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); + times->sum_exec_runtime += read_sum_exec_runtime(t); } /* If lockless access failed, take the lock. */ nextseq = 1; -- cgit v1.2.3 From 3942a9bd7b5842a924e99ee6ec1350b8006c94ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2016 18:54:13 +0200 Subject: locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write() The current percpu-rwsem read side is entirely free of serializing insns at the cost of having a synchronize_sched() in the write path. The latency of the synchronize_sched() is too high for cgroups. The commit 1ed1328792ff talks about the write path being a fairly cold path but this is not the case for Android, which moves tasks to the foreground cgroup and back around binder IPC calls from foreground processes to background processes, so it is significantly hotter than human-initiated operations. Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the problem; hopefully it will not be that slow after another commit: 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact"). We could just add rcu_sync_enter() into cgroup_init() but we do not want another synchronize_sched() at boot time, so this patch adds the new helper, which doesn't block but currently can only be called before the first use. Reported-by: John Stultz Reported-by: Dmitry Shmidt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Colin Cross Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rom Lemarchand Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Link: http://lkml.kernel.org/r/20160811165413.GA22807@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup.c | 6 ++++++ kernel/rcu/sync.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..9f51cdf58f5a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5606,6 +5606,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path.
+ */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 198473d90f81..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -84,6 +84,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) rsp->gp_type = type; } +/** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. + */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization -- cgit v1.2.3 From 84b23f9b58687a11ced66cc4be9b0219e8ecab84 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:43 -0700 Subject: locking/rwsem: Return void in __rwsem_mark_wake() We currently return a rw_semaphore structure, which is the same lock that was passed in as the function's argument in the first place. There are several functions that choose this kind of return value, and their callers use it, for example, for things like ERR_PTR. This is not the case for __rwsem_mark_wake(), and in addition this function is really about the lock waiters (which we know there are at this point), so it's somewhat odd to be returning the sem structure. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: dave@stgolabs.net Cc: jason.low2@hpe.com Cc: wanpeng.li@hotmail.com Link: http://lkml.kernel.org/r/1470384285-32163-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..b03623172277 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -121,16 +121,17 @@ enum rwsem_wake_type { * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false */ -static struct rw_semaphore * -__rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) +static void __rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - long oldcount, woken, loop, adjustment; + long loop, oldcount, woken = 0, adjustment = 0; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { /* @@ -142,19 +143,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add(wake_q, waiter->task); } - goto out; + + return; } - /* Writers might steal the lock before we grant it to the next reader. + /* + * Writers might steal the lock before we grant it to the next reader. * We prefer to do the first reader grant before counting readers * so we can bail out early if a writer stole the lock.
*/ - adjustment = 0; if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* * If the count is still less than RWSEM_WAITING_BIAS @@ -164,7 +165,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ if (atomic_long_add_return(-adjustment, &sem->count) < RWSEM_WAITING_BIAS) - goto out; + return; + /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } @@ -176,11 +178,11 @@ __rwsem_mark_wake(struct rw_semaphore *sem, rwsem_set_reader_owned(sem); } - /* Grant an infinite number of read locks to the readers at the front + /* + * Grant an infinite number of read locks to the readers at the front * of the queue. Note we increment the 'active part' of the count by * the number of readers before waking any processes up. */ - woken = 0; do { woken++; @@ -219,9 +221,6 @@ __rwsem_mark_wake(struct rw_semaphore *sem, sem->wait_list.next = next; next->prev = &sem->wait_list; - - out: - return sem; } /* @@ -255,7 +254,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); @@ -505,7 +504,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) if (count > RWSEM_WAITING_BIAS) { WAKE_Q(wake_q); - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking @@ -616,7 +615,7 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); @@ -640,7 +639,7 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); -- cgit v1.2.3 From c2867bbaf5d8f1534cae15175a389c5cbf58fec1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:44 -0700 Subject: locking/rwsem: Remove a few useless comments Our rwsem code (xadd, at least) is rather well documented, but there are a few really annoying comments in there that serve no purpose and we shouldn't bother with them. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: dave@stgolabs.net Cc: jason.low2@hpe.com Cc: wanpeng.li@hotmail.com Link: http://lkml.kernel.org/r/1470384285-32163-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b03623172277..e02fe3289b5a 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -234,7 +234,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) struct task_struct *tsk = current; WAKE_Q(wake_q); - /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -613,7 +612,6 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); locked: - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); @@ -637,7 +635,6 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); -- cgit v1.2.3 From 70800c3c0cc525baa38fd0fe4660f2c27f1bfeeb Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:45 -0700 Subject: locking/rwsem: Scan the wait_list for readers only once When wanting to wakeup readers, __rwsem_mark_wake() currently iterates the wait_list twice while looking to wakeup the first N queued reader-tasks. While this can be quite inefficient, it was there such that an awoken reader would be first and foremost acknowledged by the lock counter. Keeping the same logic, we can further benefit from the use of wake_qs and avoid entirely the first wait_list iteration that sets the counter, as wake_up_process() isn't going to occur right away, and therefore we maintain the counter->list order of going about things. Other than saving cycles with O(n) "scanning", this change also nicely cleans up a good chunk of __rwsem_mark_wake(), both visually and in being less tedious to read.
For example, the following improvements were seen on some will-it-scale microbenchmarks, on a 48-core Haswell:

                              v4.7                  v4.7-rwsem-v1
Hmean  signal1-processes-8    5792691.42 (  0.00%)  5771971.04 ( -0.36%)
Hmean  signal1-processes-12   6081199.96 (  0.00%)  6072174.38 ( -0.15%)
Hmean  signal1-processes-21   3071137.71 (  0.00%)  3041336.72 ( -0.97%)
Hmean  signal1-processes-48   3712039.98 (  0.00%)  3708113.59 ( -0.11%)
Hmean  signal1-processes-79   4464573.45 (  0.00%)  4682798.66 (  4.89%)
Hmean  signal1-processes-110  4486842.01 (  0.00%)  4633781.71 (  3.27%)
Hmean  signal1-processes-141  4611816.83 (  0.00%)  4692725.38 (  1.75%)
Hmean  signal1-processes-172  4638157.05 (  0.00%)  4714387.86 (  1.64%)
Hmean  signal1-processes-203  4465077.80 (  0.00%)  4690348.07 (  5.05%)
Hmean  signal1-processes-224  4410433.74 (  0.00%)  4687534.43 (  6.28%)
Stddev signal1-processes-8       6360.47 (  0.00%)     8455.31 ( 32.94%)
Stddev signal1-processes-12      4004.98 (  0.00%)     9156.13 (128.62%)
Stddev signal1-processes-21      3273.14 (  0.00%)     5016.80 ( 53.27%)
Stddev signal1-processes-48     28420.25 (  0.00%)    26576.22 ( -6.49%)
Stddev signal1-processes-79     22038.34 (  0.00%)    18992.70 (-13.82%)
Stddev signal1-processes-110    23226.93 (  0.00%)    17245.79 (-25.75%)
Stddev signal1-processes-141     6358.98 (  0.00%)     7636.14 ( 20.08%)
Stddev signal1-processes-172     9523.70 (  0.00%)     4824.75 (-49.34%)
Stddev signal1-processes-203    13915.33 (  0.00%)     9326.33 (-32.98%)
Stddev signal1-processes-224    15573.94 (  0.00%)    10613.82 (-31.85%)

Other runs that saw improvements include context_switch and pipe; and as expected, this is particularly highlighted on larger thread counts as it becomes more expensive to walk the list twice. No change in wakeup ordering or semantics. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: dave@stgolabs.net Cc: jason.low2@hpe.com Cc: wanpeng.li@hotmail.com Link: http://lkml.kernel.org/r/1470384285-32163-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 58 ++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index e02fe3289b5a..2337b4bb2366 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -125,12 +125,14 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long loop, oldcount, woken = 0, adjustment = 0; + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -180,36 +182,21 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. + * of the queue. We know that woken will be at least 1 as we accounted + * for above.
Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. */ - do { - woken++; + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + struct task_struct *tsk; - if (waiter->list.next == &sem->wait_list) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; + woken++; tsk = waiter->task; wake_q_add(wake_q, tsk); + list_del(&waiter->list); /* * Ensure that the last operation is setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -217,10 +204,16 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); - } while (--loop); + } - sem->wait_list.next = next; - next->prev = &sem->wait_list; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); } /* @@ -245,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* If there are no active locks, wake the front queued process(es). + /* + * If there are no active locks, wake the front queued process(es). * * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! -- cgit v1.2.3 From 255e732c61dbb6a0bf9e0a3d6bc45f202853c880 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 17 Aug 2016 20:58:28 -0400 Subject: livepatch: use arch_klp_init_object_loaded() to finish arch-specific tasks Introduce arch_klp_init_object_loaded() to complete any additional arch-specific tasks during patching. Architecture code may override this function. Signed-off-by: Jessica Yu Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 8bbe50704621..5fbabe022286 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod, objname = klp_is_module(obj) ? obj->name : "vmlinux"; - module_disable_ro(pmod); /* For each klp relocation section */ for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { sec = pmod->klp_info->sechdrs + i; @@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod, true); return ret; } @@ -763,6 +761,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? 
func->old_sympos : 1); } +/* Arches may override this to finish any remaining arch-specific tasks */ +void __weak arch_klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ +} + /* parts of the initialization that is done only when the object is loaded */ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_object *obj) @@ -770,9 +774,15 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); - if (ret) + if (ret) { + module_enable_ro(patch->mod, true); return ret; + } + + arch_klp_init_object_loaded(patch, obj); + module_enable_ro(patch->mod, true); klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, -- cgit v1.2.3 From bedc1969150d480c462cdac320fa944b694a7162 Mon Sep 17 00:00:00 2001 From: Ding Tianhong Date: Wed, 15 Jun 2016 15:27:36 +0800 Subject: rcu: Fix soft lockup for rcu_nocb_kthread Carrying out the following steps results in a softlockup in the RCU callback-offload (rcuo) kthreads: 1. Connect to ixgbevf, and set the speed to 10Gb/s. 2. Use ifconfig to bring the nic up and down repeatedly. [ 317.005148] IPv6: ADDRCONF(NETDEV_CHANGE): eth2: link becomes ready [ 368.106005] BUG: soft lockup - CPU#1 stuck for 22s! [rcuos/1:15] [ 368.106005] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 368.106005] task: ffff88057dd8a220 ti: ffff88057dd9c000 task.ti: ffff88057dd9c000 [ 368.106005] RIP: 0010:[] [] fib_table_lookup+0x14/0x390 [ 368.106005] RSP: 0018:ffff88061fc83ce8 EFLAGS: 00000286 [ 368.106005] RAX: 0000000000000001 RBX: 00000000020155c0 RCX: 0000000000000001 [ 368.106005] RDX: ffff88061fc83d50 RSI: ffff88061fc83d70 RDI: ffff880036d11a00 [ 368.106005] RBP: ffff88061fc83d08 R08: 0000000000000001 R09: 0000000000000000 [ 368.106005] R10: ffff880036d11a00 R11: ffffffff819e0900 R12: ffff88061fc83c58 [ 368.106005] R13: ffffffff816154dd R14: ffff88061fc83d08 R15: 00000000020155c0 [ 368.106005] FS: 0000000000000000(0000) GS:ffff88061fc80000(0000) knlGS:0000000000000000 [ 368.106005] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 368.106005] CR2: 00007f8c2aee9c40 CR3: 000000057b222000 CR4: 00000000000407e0 [ 368.106005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 368.106005] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 368.106005] Stack: [ 368.106005] 00000000010000c0 ffff88057b766000 ffff8802e380b000 ffff88057af03e00 [ 368.106005] ffff88061fc83dc0 ffffffff815349a6 ffff88061fc83d40 ffffffff814ee146 [ 368.106005] ffff8802e380af00 00000000e380af00 ffffffff819e0900 020155c0010000c0 [ 368.106005] Call Trace: [ 368.106005] [ 368.106005] [ 368.106005] [] ip_route_input_noref+0x516/0xbd0 [ 368.106005] [] ? skb_release_data+0xd6/0x110 [ 368.106005] [] ? kfree_skb+0x3a/0xa0 [ 368.106005] [] ip_rcv_finish+0x29f/0x350 [ 368.106005] [] ip_rcv+0x234/0x380 [ 368.106005] [] __netif_receive_skb_core+0x676/0x870 [ 368.106005] [] __netif_receive_skb+0x18/0x60 [ 368.106005] [] process_backlog+0xae/0x180 [ 368.106005] [] net_rx_action+0x152/0x240 [ 368.106005] [] __do_softirq+0xef/0x280 [ 368.106005] [] call_softirq+0x1c/0x30 [ 368.106005] [ 368.106005] [ 368.106005] [] do_softirq+0x65/0xa0 [ 368.106005] [] local_bh_enable+0x94/0xa0 [ 368.106005] [] rcu_nocb_kthread+0x232/0x370 [ 368.106005] [] ? wake_up_bit+0x30/0x30 [ 368.106005] [] ? rcu_start_gp+0x40/0x40 [ 368.106005] [] kthread+0xcf/0xe0 [ 368.106005] [] ? 
kthread_create_on_node+0x140/0x140 [ 368.106005] [] ret_from_fork+0x58/0x90 [ 368.106005] [] ? kthread_create_on_node+0x140/0x140 ==================================cut here============================== It turns out that the rcuos callback-offload kthread is busy processing a very large quantity of RCU callbacks, and it is not relinquishing the CPU while doing so. This commit therefore adds a cond_resched_rcu_qs() within the loop to allow other tasks to run. Signed-off-by: Ding Tianhong [ paulmck: Substituted cond_resched_rcu_qs for cond_resched. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0082fce402a0..85c5a883c6e3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); + cond_resched_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); -- cgit v1.2.3 From f7b8eb847e35b18d3ec333774691a905bf16017f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Jun 2016 11:30:32 -0700 Subject: rcu: Consolidate expedited grace period machinery The functions synchronize_rcu_expedited() and synchronize_sched_expedited() have nearly identical code. This commit therefore consolidates this code into a new _synchronize_rcu_expedited() function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 62 ++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d86ab6ec2c9..1549f456fb7b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -516,6 +516,33 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } +/* + * Given an rcu_state pointer and a smp_call_function() handler, kick + * off the specified flavor of expedited grace period. + */ +static void _synchronize_rcu_expedited(struct rcu_state *rsp, + smp_call_func_t func) +{ + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(rsp->call); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, func); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -534,29 +561,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) */ void synchronize_sched_expedited(void) { - unsigned long s; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone.
*/ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -620,23 +631,8 @@ static void sync_rcu_exp_handler(void *info) void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; - unsigned long s; - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- cgit v1.2.3 From 8b355e3bc1408be238ae4695fb6318ae502cae8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 13:46:25 -0700 Subject: rcu: Drive expedited grace periods from workqueue The current implementation of expedited grace periods has the user task drive the grace period. This works, but has downsides: (1) The user task must awaken tasks piggybacking on this grace period, which can result in latencies rivaling that of the grace period itself, and (2) User tasks can receive signals, which interfere with RCU CPU stall warnings. This commit therefore uses workqueues to drive the grace periods, so that the user task need not do the awakening. A subsequent commit will remove the now-unnecessary code allowing for signals. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 48 ++++++++++++++++++++++++++++++++++++++++++------ kernel/rcu/tree_trace.c | 7 ++++--- 3 files changed, 47 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f714f873bf9d..e99a5234d9ed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -400,6 +400,7 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + atomic_long_t exp_workdone0; /* # done by workqueue. */ atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1549f456fb7b..97f5ffe42b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -500,7 +500,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) * next GP, to proceed. */ mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { @@ -516,6 +515,29 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } +/* Let the workqueue handler know what it is supposed to do. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + /* Initialize the rcu_node tree in preparation for the wait. */ + rewp = container_of(wp, struct rcu_exp_work, rew_work); + sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); + + /* Wait and clean up, including waking everyone. 
*/ + rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); +} + /* * Given an rcu_state pointer and a smp_call_function() handler, kick * off the specified flavor of expedited grace period. @@ -523,6 +545,9 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) static void _synchronize_rcu_expedited(struct rcu_state *rsp, smp_call_func_t func) { + struct rcu_data *rdp; + struct rcu_exp_work rew; + struct rcu_node *rnp; unsigned long s; /* If expedited grace periods are prohibited, fall back to normal. */ @@ -536,11 +561,22 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, func); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); + /* Marshall arguments and schedule the expedited grace period. */ + rew.rew_func = func; + rew.rew_rsp = rsp; + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + schedule_work(&rew.rew_work); + + /* Wait for expedited grace period to complete. */ + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + rnp = rcu_get_root(rsp); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone0, s)); + + /* Let the next expedited grace period start. */ + mutex_unlock(&rsp->exp_mutex); } /** diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 86782f9a4604..b1f28972872c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s1 = 0, s2 = 0, s3 = 0; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->exp_workdone0); s1 += atomic_long_read(&rdp->exp_workdone1); s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s1, s2, s3, + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); -- cgit v1.2.3 From 908d2c1fd156d414008e1b7e1fb5a7716e013231 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 14:34:59 -0700 Subject: rcu: Stop disabling expedited RCU CPU stall warnings Now that RCU expedited grace periods are always driven by a workqueue, there is no need to account for signal reception, and thus no need to disable expedited RCU CPU stall warnings due to signal reception. This commit therefore removes the signal-reception checks, leaving a WARN_ON() to catch possible future bugs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 97f5ffe42b58..3a647eb96f23 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -427,12 +427,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_stall); if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. 
*/ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } + WARN_ON(ret < 0); /* workqueues should not be signaled. */ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); ndetected = 0; -- cgit v1.2.3 From 24a6cff286030b98149ff10b968cba31280fcb7a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 14:49:29 -0700 Subject: rcu: Make expedited RCU CPU stall warnings respond to controls The expedited RCU CPU stall warnings currently respond to neither the panic_on_rcu_stall sysctl setting nor the rcupdate.rcu_cpu_stall_suppress kernel boot parameter. This commit therefore updates the expedited code to respond to these two controls. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3a647eb96f23..f316683b18f1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -428,6 +428,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ + if (rcu_cpu_stall_suppress) + continue; + panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); ndetected = 0; -- cgit v1.2.3 From 98834b83785e1388fa8672cf4f8de09974d15e86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jun 2016 17:04:19 -0700 Subject: rcu: Exclude RCU-offline CPUs from expedited grace periods The expedited RCU grace periods currently rely on a failure indication from smp_call_function_single() to determine that a given CPU is offline. This works after a fashion, but is more contorted and less precise than relying on RCU's internal state. This commit therefore takes a first step towards relying on internal state. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f316683b18f1..3bc4b3dda801 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -359,7 +359,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; -- cgit v1.2.3 From 385c859f678e8ee6b0b122086f34e72a0e861cef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 12:16:11 -0700 Subject: rcu: Use RCU's online-CPU state for expedited IPI retry This commit improves the accuracy of the interaction between CPU hotplug operations and RCU's expedited grace periods by using RCU's online-CPU state to determine when failed IPIs should be retried. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3bc4b3dda801..24343eb87b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -385,17 +385,16 @@ retry_ipi: mask_ofl_ipi &= ~mask; continue; } - /* Failed, raced with offline. */ + /* Failed, raced with CPU hotplug operation.
*/ raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && + if ((rnp->qsmaskinitnext & mask) && (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + goto retry_ipi; } + /* CPU really is offline, so we can ignore it. */ if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit v1.2.3 From 94d44776737266eccafee32b985fe31fd5e021ca Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 22 Jun 2016 17:19:27 +0800 Subject: rcu: Use rcu_gp_kthread_wake() to wake up grace period kthreads Commit abedf8e2419f ("rcu: Use simple wait queues where possible in rcutree") converts Tree RCU's wait queues to simple wait queues, but it incorrectly reverts the commit 2aa792e6faf1 ("rcu: Use rcu_gp_kthread_wake() to wake up grace period kthreads"). This can result in redundant self-wakeups. This commit therefore replaces the simple wait-queue wakeups with rcu_gp_kthread_wake(), thus avoiding the redundant wakeups. Signed-off-by: Jisheng Zhang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..cc1779a7ec5f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2344,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* -- cgit v1.2.3 From 379d9ecb3cc9d5d043216185904c00e54c736a96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 10:37:20 -0700 Subject: sched: Make wake_up_nohz_cpu() handle CPUs going offline Both timers and hrtimers are maintained on the outgoing CPU until CPU_DEAD time, at which point they are migrated to a surviving CPU. 
If a mod_timer() executes between CPU_DYING and CPU_DEAD time, x86 systems will splat in native_smp_send_reschedule() when attempting to wake up the just-now-offlined CPU, as shown below from a NO_HZ_FULL kernel: [ 7976.741556] WARNING: CPU: 0 PID: 661 at /home/paulmck/public_git/linux-rcu/arch/x86/kernel/smp.c:125 native_smp_send_reschedule+0x39/0x40 [ 7976.741595] Modules linked in: [ 7976.741595] CPU: 0 PID: 661 Comm: rcu_torture_rea Not tainted 4.7.0-rc2+ #1 [ 7976.741595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 [ 7976.741595] 0000000000000000 ffff88000002fcc8 ffffffff8138ab2e 0000000000000000 [ 7976.741595] 0000000000000000 ffff88000002fd08 ffffffff8105cabc 0000007d1fd0ee18 [ 7976.741595] 0000000000000001 ffff88001fd16d40 ffff88001fd0ee00 ffff88001fd0ee00 [ 7976.741595] Call Trace: [ 7976.741595] [] dump_stack+0x67/0x99 [ 7976.741595] [] __warn+0xcc/0xf0 [ 7976.741595] [] warn_slowpath_null+0x18/0x20 [ 7976.741595] [] native_smp_send_reschedule+0x39/0x40 [ 7976.741595] [] wake_up_nohz_cpu+0x82/0x190 [ 7976.741595] [] internal_add_timer+0x7a/0x80 [ 7976.741595] [] mod_timer+0x187/0x2b0 [ 7976.741595] [] rcu_torture_reader+0x33d/0x380 [ 7976.741595] [] ? sched_torture_read_unlock+0x30/0x30 [ 7976.741595] [] ? rcu_bh_torture_read_lock+0x80/0x80 [ 7976.741595] [] kthread+0xdf/0x100 [ 7976.741595] [] ret_from_fork+0x1f/0x40 [ 7976.741595] [] ? kthread_create_on_node+0x200/0x200 However, in this case, the wakeup is redundant, because the timer migration will reprogram timer hardware as needed. Note that the fact that preemption is disabled does not avoid the splat, as the offline operation has already passed both the synchronize_sched() and the stop_machine() that would be blocked by disabled preemption. This commit therefore modifies wake_up_nohz_cpu() to avoid attempting to wake up offline CPUs. It also adds a comment stating that the caller must tolerate lost wakeups when the target CPU is going offline, and suggesting the CPU_DEAD notifier as a recovery mechanism. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c883fe8e440..2a18856f00ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -580,6 +580,8 @@ static bool wake_up_full_nohz_cpu(int cpu) * If needed we can still optimize that later with an * empty IRQ. */ + if (cpu_is_offline(cpu)) + return true; /* Don't try to wake offline CPUs. */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) @@ -590,6 +592,11 @@ static bool wake_up_full_nohz_cpu(int cpu) return false; } +/* + * Wake up the specified CPU. If the CPU is going offline, it is the + * caller's responsibility to deal with the lost wakeup, for example, + * by hooking into the CPU_DEAD notifier like timers and hrtimers do. + */ void wake_up_nohz_cpu(int cpu) { if (!wake_up_full_nohz_cpu(cpu)) -- cgit v1.2.3 From e77b7041258e11ba198951553d3acf1e371a9053 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 15 Jul 2016 12:19:41 -0400 Subject: rcu: Don't use modular infrastructure in non-modular code The Kconfig currently controlling compilation of tree.c is: init/Kconfig:config TREE_RCU init/Kconfig: bool ...and update.c and sync.c are "obj-y" meaning that none are ever built as a module by anyone. Since MODULE_ALIAS is a no-op for non-modular code, we can remove them from these files. 
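For reference, the reason MODULE_ALIAS() is safe to drop from built-in code: when MODULE is not defined, the modinfo machinery compiles away to a harmless empty declaration. A simplified sketch approximating the include/linux/module.h and include/linux/moduleparam.h arrangement of this era (exact details vary by kernel version; this is an illustration, not a verbatim quote):

#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)

#ifdef MODULE
/* Modular build: emit a "tag=info" string into the .modinfo section. */
#define __MODULE_INFO(tag, name, info)                                  \
        static const char __UNIQUE_ID(name)[]                           \
                __used __attribute__((section(".modinfo"), unused,      \
                                      aligned(1)))                      \
                = __stringify(tag) "=" info
#else
/* Built-in: expands to an unused empty struct declaration, a no-op. */
#define __MODULE_INFO(tag, name, info) struct __UNIQUE_ID(name) {}
#endif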
We leave moduleparam.h behind since the files instantiate some boot time configuration parameters with module_param() still. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- kernel/rcu/update.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cc1779a7ec5f..e83446062f65 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -60,7 +59,6 @@ #include "tree.h" #include "rcu.h" -MODULE_ALIAS("rcutree"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f0d8322bc3ec..f19271dce0a9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include @@ -54,7 +54,6 @@ #include "rcu.h" -MODULE_ALIAS("rcupdate"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif -- cgit v1.2.3 From 3563a438f124cb0b8cfd350c86de2f26c63d8837 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jul 2016 09:39:11 -0700 Subject: rcu: Avoid redundant quiescent-state chasing Currently, __note_gp_changes() checks to see if the CPU has slept through multiple grace periods. If it has, it resynchronizes that CPU's view of the grace-period state, which includes whether or not the current grace period needs a quiescent state from this CPU. The fact of this need (or lack thereof) needs to be in two places, rdp->cpu_no_qs.b.norm and rdp->core_needs_qs. The former tells RCU's context-switch code to go get a quiescent state and the latter says that it needs to be reported. The current code unconditionally sets the former to true, but correctly sets the latter. This does not result in failures, but it does unnecessarily increase the amount of work done on average at context-switch time. This commit therefore correctly sets both fields. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e83446062f65..733902c33dd2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1846,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { bool ret; + bool need_gp; /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && @@ -1872,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->cpu_no_qs.b.norm = true; + need_gp = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } -- cgit v1.2.3 From 7ec99de36f402618ae44147ac7fa9a07e4757a5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 13:58:26 -0700 Subject: rcu: Provide exact CPU-online tracking for RCU Up to now, RCU has assumed that the CPU-online process makes it from CPU_UP_PREPARE to set_cpu_online() within one jiffy. Given the recent rise of virtualized environments, this assumption is very clearly obsolete. 
Failing to meet this deadline can result in RCU paying attention to an incoming CPU for one jiffy, then ignoring it until the grace period following the one in which that CPU sets itself online. This situation might prove to be fatally disappointing to any RCU read-side critical sections that had the misfortune to execute during the time in which RCU was ignoring the slow-to-come-online CPU. This commit therefore updates RCU's internal CPU state-tracking information at notify_cpu_starting() time, thus providing RCU with an exact transition of the CPU's state from offline to online. Note that this means that incoming CPUs must not use RCU read-side critical section (other than those of SRCU) until notify_cpu_starting() time. Note also that the CPU_STARTING notifiers -are- allowed to use RCU read-side critical sections. (Of course, CPU-hotplug notifiers are rapidly becoming obsolete, so you need to act fast!) If a given architecture or CPU family needs to use RCU read-side critical sections earlier, the call to rcu_cpu_starting() from notify_cpu_starting() will need to be architecture-specific, with architectures that need early use being required to hand-place the call to rcu_cpu_starting() at some point preceding the call to notify_cpu_starting(). Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 1 + kernel/rcu/tree.c | 32 +++++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..9482ceb928e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -889,6 +889,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + rcu_cpu_starting(cpu); /* All CPU_STARTING notifiers can use RCU. */ while (st->state < target) { struct cpuhp_step *step; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..d2973fb85e8c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3792,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rnp->qsmaskinitnext |= mask; - rnp->expmaskinitnext |= mask; if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ @@ -3860,6 +3858,32 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +/* + * Mark the specified CPU as being online so that subsequent grace periods + * (both expedited and normal) will wait on it. Note that this means that + * incoming CPUs are not allowed to use RCU read-side critical sections + * until this function is called. Failing to observe this restriction + * will result in lockdep splats. + */ +void rcu_cpu_starting(unsigned int cpu) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() @@ -4209,8 +4233,10 @@ void __init rcu_init(void) * or the scheduler are operational. 
*/ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + } } #include "tree_exp.h" -- cgit v1.2.3 From 0c6d4576c45736f829dc3390ac95181b2ed21bc7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 17 Aug 2016 14:21:04 +0200 Subject: cpu/hotplug: Get rid of CPU_STARTING reference CPU_STARTING is scheduled for removal. There is no use of it in drivers and core code uses it only for compatibility with old-style CPU-hotplug notifiers. This patch therefore removes CPU_STARTING from an RCU-related comment. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9482ceb928e0..ff8bc3817dde 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -889,7 +889,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - rcu_cpu_starting(cpu); /* All CPU_STARTING notifiers can use RCU. */ + rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { struct cpuhp_step *step; -- cgit v1.2.3 From 0ffd374b2207a1a0cba9f2dbcc799198482391d5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:22 +0200 Subject: rcutorture: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. Cc: Josh Triplett Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 52 +++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..dc9814860645 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1362,12 +1362,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static void rcutorture_booster_cleanup(int cpu) +static int rcutorture_booster_cleanup(unsigned int cpu) { struct task_struct *t; if (boost_tasks[cpu] == NULL) - return; + return 0; mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; @@ -1375,9 +1375,10 @@ /* This must be outside of the mutex, otherwise deadlock! 
*/ torture_stop_kthread(rcu_torture_boost, t); + return 0; } -static int rcutorture_booster_init(int cpu) +static int rcutorture_booster_init(unsigned int cpu) { int retval; @@ -1577,28 +1578,7 @@ static void rcu_torture_barrier_cleanup(void) } } -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; +static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) @@ -1638,11 +1618,8 @@ rcu_torture_cleanup(void) for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } + test_boost == 2) + cpuhp_remove_state(rcutor_hp); /* * Wait for all RCU callbacks to fire, then do flavor-specific @@ -1869,14 +1846,13 @@ rcu_torture_init(void) test_boost == 2) { boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. */ - firsterr = rcutorture_booster_init(i); - if (firsterr) - goto unwind; - } + + firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", + rcutorture_booster_init, + rcutorture_booster_cleanup); + if (firsterr < 0) + goto unwind; + rcutor_hp = firsterr; } firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) -- cgit v1.2.3 From 31257c3c8b7307f106d67345755d937cb5fb8bd4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Jun 2016 07:45:43 -0700 Subject: torture: Convert torture_shutdown() to hrtimer Upcoming changes to the timer wheel introduce significant inaccuracy and possibly also an ultimate limit on timeout duration. This is a problem for the current implementation of torture_shutdown() because (1) shutdown times are user-specified, and can therefore be quite long, and (2) the torture scripting will kill a test instance that runs for more than a few minutes longer than scheduled. This commit therefore converts the torture_shutdown() timed waits to an hrtimer, thus avoiding too-short torture test runs as well as death by scripting. Signed-off-by: Paul E. McKenney Acked-by: Arnd Bergmann --- kernel/torture.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 75961b3decfe..0d887eb62856 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); * Variables for auto-shutdown. This allows "lights out" torture runs * to be fully scripted. */ -static int shutdown_secs; /* desired test duration in seconds. */ static struct task_struct *shutdown_task; -static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static ktime_t shutdown_time; /* time to system shutdown. 
*/ static void (*torture_shutdown_hook)(void); /* @@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb); */ static int torture_shutdown(void *arg) { - long delta; - unsigned long jiffies_snap; + ktime_t ktime_snap; VERBOSE_TOROUT_STRING("torture_shutdown task started"); - jiffies_snap = jiffies; - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + ktime_snap = ktime_get(); + while (ktime_before(ktime_snap, shutdown_time) && !torture_must_stop()) { - delta = shutdown_time - jiffies_snap; if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = jiffies; + "torture_shutdown task: %llu ms remaining\n", + torture_type, + ktime_ms_delta(shutdown_time, ktime_snap)); + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS); + ktime_snap = ktime_get(); } if (torture_must_stop()) { torture_kthread_stopping("torture_shutdown"); @@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { int ret = 0; - shutdown_secs = ssecs; torture_shutdown_hook = cleanup; - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; + if (ssecs > 0) { + shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); ret = torture_create_kthread(torture_shutdown, NULL, shutdown_task); } -- cgit v1.2.3 From 4ffa66992476c94d8b4d33b2c792d336a400ada2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jun 2016 11:56:38 -0700 Subject: torture: Add task state to writer-task stall printk()s This commit adds a dump of the scheduler state for stalled rcutorture writer tasks. This addition provides yet more debug for the intermittent "failures to proceed", where grace periods move ahead but the rcutorture writer tasks fail to do so. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..f0f32f888ec5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1238,6 +1238,7 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; static unsigned long rtcv_snap = ULONG_MAX; + struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -1312,10 +1313,12 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + wtp = READ_ONCE(writer_task); + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, - gpnum, completed, flags); + gpnum, completed, flags, + wtp == NULL ? ~0UL : wtp->state); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } -- cgit v1.2.3 From 472213a675e21185416101a77102253f93713fa9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 13 Aug 2016 15:54:35 +0900 Subject: rcutorture: Print out barrier error as document says Tests for rcu_barrier() were introduced by commit fae4b54f28f0 ("rcu: Introduce rcutorture testing for rcu_barrier()"). This commit updated the documentation to say that the "rtbe" field in rcutorture's dmesg output indicates test failure. However, the code was not updated, only the documentation. This commit therefore updates the code to match the updated documentation. 
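With the fix applied, the statistics line carries the barrier-error count under the documented "rtbe" label. A hand-constructed illustration of the field order (not captured from a real run; a nonzero rtbe value now flags an rcu_barrier() test failure):

  rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 120372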
Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f0f32f888ec5..ac29017623e5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1259,8 +1259,9 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_barrier_error, n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", -- cgit v1.2.3 From a56fefa2605cf8e125ef09451487f30336128028 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 21 Aug 2016 16:54:39 +0900 Subject: rcuperf: Consistently insert space between flag and message A few rcuperf dmesg output messages have no space between the flag and the start of the message. In contrast, every other messages consistently supplies a single space. This difference makes rcuperf dmesg output hard to read and to mechanically parse. This commit therefore fixes this problem by modifying a pr_alert() call and PERFOUT_STRING() macro function to provide that single space. Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d38ab08a3fe7..123ccbd22449 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. McKenney "); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG s "\n", perf_type) + pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) #define VERBOSE_PERFOUT_STRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) #define VERBOSE_PERFOUT_ERRSTRING(s) \ @@ -400,9 +400,8 @@ rcu_perf_writer(void *arg) sp.sched_priority = 0; sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - pr_alert("%s" PERF_FLAG - "rcu_perf_writer %ld has %d measurements\n", - perf_type, me, MIN_MEAS); + pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", + perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); -- cgit v1.2.3 From 17ce3dc7e5a0e4796cc7838d1f7b2531d0bca130 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 18 Aug 2016 17:57:50 +0900 Subject: ftrace: kprobe: uprobe: Add x8/x16/x32/x64 for hexadecimal types Add x8/x16/x32/x64 for hexadecimal type casting to kprobe/uprobe event tracer. These type casts can be used for integer arguments for explicitly showing them in hexadecimal digits in formatted text. 
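As a usage sketch, a hypothetical probe definition on x86-64 (the probe name, symbol, and register choices here are illustrative only; adjust them for your architecture and kernel):

  echo 'p:myprobe do_sys_open dfd=%di:x32 flags=%dx:x32' >> /sys/kernel/debug/tracing/kprobe_events

An argument cast with :x32 is rendered as 0x-prefixed hexadecimal in the trace output. Note that a later patch in this series switches the plain u8/u16/u32/u64 casts to decimal, after which the x types become the explicit way to request hexadecimal.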
Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Alexander Shishkin Cc: Hemant Kumar Cc: Naohiro Aota Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/147151067029.12957.11591314629326414783.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 4 ++++ kernel/trace/trace_probe.c | 30 +++++++++++++++++------------- kernel/trace/trace_probe.h | 9 +++++++++ kernel/trace/trace_uprobe.c | 4 ++++ 4 files changed, 34 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9aedb0b06683..eb6c9f1d3a93 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = { ASSIGN_FETCH_TYPE(s16, u16, 1), ASSIGN_FETCH_TYPE(s32, u32, 1), ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), ASSIGN_FETCH_TYPE_END }; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 74e80a582c28..725af9dcbdff 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -36,24 +36,28 @@ const char *reserved_field_names[] = { }; /* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ void *data, void *ent) \ { \ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ -const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") +DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") /* Print type function for string type */ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 45400ca5ded1..f0c470a10edd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8); DECLARE_BASIC_PRINT_TYPE_FUNC(s16); DECLARE_BASIC_PRINT_TYPE_FUNC(s32); DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(x8); +DECLARE_BASIC_PRINT_TYPE_FUNC(x16); +DECLARE_BASIC_PRINT_TYPE_FUNC(x32); +DECLARE_BASIC_PRINT_TYPE_FUNC(x64); + DECLARE_BASIC_PRINT_TYPE_FUNC(string); #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type 
@@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) +/* If ptype is an alias of atype, use this macro (show atype in format) */ +#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) + #define ASSIGN_FETCH_TYPE_END {} #define FETCH_TYPE_STRING 0 diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..7a687320f867 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = { ASSIGN_FETCH_TYPE(s16, u16, 1), ASSIGN_FETCH_TYPE(s32, u32, 1), ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), ASSIGN_FETCH_TYPE_END }; -- cgit v1.2.3 From 864256255597aad86abcecbe6c53da8852ded15b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 18 Aug 2016 17:58:15 +0900 Subject: ftrace: probe: Add README entries for k/uprobe-events Add README entries for kprobe-events and uprobe-events. This allows users to check which options are acceptable to the running kernel. For example, perf tools can choose the correct types for the kernel. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Alexander Shishkin Cc: Hemant Kumar Cc: Naohiro Aota Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/147151069524.12957.12957179170304055028.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dade4c9559cc..1e2ce3b52e51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4123,6 +4123,30 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_KPROBE_EVENT + " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_UPROBE_EVENT + " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) + "\t accepts: event-definitions (one definition per line)\n" + "\t Format: p|r[:[/]] []\n" + "\t -:[/]\n" +#ifdef CONFIG_KPROBE_EVENT + "\t place: [:][+]|\n" +#endif +#ifdef CONFIG_UPROBE_EVENT + "\t place: :\n" +#endif + "\t args: =fetcharg[:type]\n" + "\t fetcharg: %, @
, @[+|-],\n" + "\t $stack, $stack, $retval, $comm\n" + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" + "\t b@/\n" +#endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events//\t- Directory containing all trace events for :\n" -- cgit v1.2.3 From bdca79c2bf40556b664c9b1c32aec103e9bdb4a9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 18 Aug 2016 17:59:21 +0900 Subject: ftrace: kprobe: uprobe: Show u8/u16/u32/u64 types in decimal Change kprobe/uprobe-tracer to show the arguments type-casted with u8/u16/u32/u64 in decimal digits instead of hexadecimal. To minimize compatibility issue, the arguments without type casting are typed by x64 (or x32 for 32bit arch) by default. Note: all arguments set by old perf probe without types are shown in decimal by default. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Alexander Shishkin Cc: Hemant Kumar Cc: Naohiro Aota Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/147151076135.12957.14684546093034343894.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_probe.c | 8 ++++---- kernel/trace/trace_probe.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 725af9dcbdff..8c0553d9afd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -46,10 +46,10 @@ int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f0c470a10edd..0c0ae54d44c6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -208,7 +208,7 @@ DEFINE_FETCH_##method(u32) \ DEFINE_FETCH_##method(u64) /* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t +#define __DEFAULT_FETCH_TYPE(t) x##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -- cgit v1.2.3 From ba14a194a434ccc8f733e263ad2ce941e35e5787 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:21 -0700 Subject: fork: Add generic vmalloced stack support If CONFIG_VMAP_STACK=y is selected, kernel stacks are allocated with __vmalloc_node_range(). Grsecurity has had a similar feature (called GRKERNSEC_KSTACKOVERFLOW=y) for a long time. Signed-off-by: Andy Lutomirski Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org [ Minor edits. 
] Signed-off-by: Ingo Molnar --- kernel/fork.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..9b85f6b2cdcd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long *stack) * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ -# if THREAD_SIZE >= PAGE_SIZE -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { +#ifdef CONFIG_VMAP_STACK + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, + __builtin_return_address(0)); + + /* + * We can't call find_vm_area() in interrupt context, and + * free_thread_stack() can be called in interrupt context, + * so cache the vm_struct. + */ + if (stack) + tsk->stack_vm_area = find_vm_area(stack); + return stack; +#else struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; +#endif } -static inline void free_thread_stack(unsigned long *stack) +static inline void free_thread_stack(struct task_struct *tsk) { - __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); + if (task_stack_vm_area(tsk)) + vfree(tsk->stack); + else + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -181,9 +201,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_stack(unsigned long *stack) +static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, stack); + kmem_cache_free(thread_stack_cache, tsk->stack); } void thread_stack_cache_init(void) @@ -213,24 +233,47 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(unsigned long *stack, int account) +static void account_kernel_stack(struct task_struct *tsk, int account) { - /* All stack pages are in the same zone and belong to the same memcg. */ - struct page *first_page = virt_to_page(stack); + void *stack = task_stack_page(tsk); + struct vm_struct *vm = task_stack_vm_area(tsk); + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + + if (vm) { + int i; + + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_zone_page_state(page_zone(vm->pages[i]), + NR_KERNEL_STACK_KB, + PAGE_SIZE / 1024 * account); + } - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); + /* All stack pages belong to the same memcg. */ + memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } else { + /* + * All stack pages are in the same zone and belong to the + * same memcg. 
+ */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat( - first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } } void free_task(struct task_struct *tsk) { - account_kernel_stack(tsk->stack, -1); + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); - free_thread_stack(tsk->stack); + free_thread_stack(tsk); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -342,6 +385,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; + struct vm_struct *stack_vm_area; int err; if (node == NUMA_NO_NODE) @@ -354,11 +398,23 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + stack_vm_area = task_stack_vm_area(tsk); + err = arch_dup_task_struct(tsk, orig); + + /* + * arch_dup_task_struct() clobbers the stack-related fields. Make + * sure they're properly initialized before using any stack-related + * functions again. + */ + tsk->stack = stack; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; +#endif + if (err) goto free_stack; - tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -390,14 +446,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(stack, 1); + account_kernel_stack(tsk, 1); kcov_task_init(tsk); return tsk; free_stack: - free_thread_stack(stack); + free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; -- cgit v1.2.3 From e4a744ef2fef5c803348b650a3a2d01da7797a9b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:55 -0500 Subject: ftrace: Remove CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST from config Make HAVE_FUNCTION_GRAPH_FP_TEST a normal define, independent from kconfig. This removes some config file pollution and simplifies the checking for the fp test. Suggested-by: Steven Rostedt Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2c4e5f05054d6d367f702fd153af7a0109dd5c81.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 5 ----- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..ba3326785ca4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_GRAPH_FP_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..fc173cd9fbfd 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -204,7 +204,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it -- cgit v1.2.3 From daa460a88c09b26b68e8b017de589c217e901afb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:56 -0500 Subject: ftrace: Only allocate the ret_stack 'fp' field when needed This saves some memory when HAVE_FUNCTION_GRAPH_FP_TEST isn't defined. On x86_64 with newer versions of gcc which have -mfentry, it saves 400 bytes per task. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5c7747d9ea7b5cb47ef0a8ce8a6cea6bf7aa94bf.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index fc173cd9fbfd..0e03ed0eac68 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -171,7 +171,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; +#endif *depth = current->curr_ret_stack; return 0; -- cgit v1.2.3 From 9a7c348ba6a46f6270d4fe49577649dad5664fe7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:57 -0500 Subject: ftrace: Add return address pointer to ftrace_ret_stack Storing this value will help prevent unwinders from getting out of sync with the function graph tracer ret_stack. Now instead of needing a stateful iterator, they can compare the return address pointer to find the right ret_stack entry. Note that an array of 50 ftrace_ret_stack structs is allocated for every task. So when an arch implements this, it will add either 200 or 400 bytes of memory usage per task (depending on whether it's a 32-bit or 64-bit platform). 
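The arithmetic behind those numbers, assuming the usual FTRACE_RETFUNC_DEPTH of 50 ret_stack entries per task and one added pointer field per entry:

  50 entries * sizeof(unsigned long *) = 50 * 4 bytes = 200 bytes (32-bit)
                                         50 * 8 bytes = 400 bytes (64-bit)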
Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a95cfcc39e8f26b89a430c56926af0bb217bc0a1.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0e03ed0eac68..f7212ec643e2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer) + unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; int index; @@ -173,6 +173,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].subtime = 0; #ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; #endif *depth = current->curr_ret_stack; -- cgit v1.2.3 From 223918e32a87c79ac55ca4aa513ba405ba4d57cd Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:58 -0500 Subject: ftrace: Add ftrace_graph_ret_addr() stack unwinding helpers When function graph tracing is enabled for a function, ftrace modifies the stack by replacing the original return address with the address of a hook function (return_to_handler). Stack unwinders need a way to get the original return address. Add an arch-independent helper function for that named ftrace_graph_ret_addr(). This adds two variations of the function: one depends on HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, and the other relies on an index state variable. The former is recommended because, in some cases, the latter can cause problems when the unwinder skips stack frames. It can get out of sync with the ret_stack index and wrong addresses can be reported for the stack trace. Once all arches have been ported to use HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, we can get rid of the distinction. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/36bd90f762fc5e5af3929e3797a68a64906421cf.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 58 ++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f7212ec643e2..0cbe38a844fa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -284,6 +284,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. + */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)return_to_handler) + return ret; + + if (index < -1) + index += FTRACE_NOTRACE_DEPTH; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)return_to_handler) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, -- cgit v1.2.3 From b2d4c2edb2e4f89aaf85449dee3b87fbf0f8a4d4 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 18 Aug 2016 18:41:00 +0200 Subject: locking/hung_task: Show all locks When we get a hung task it can often be valuable to see _all_ the held locks on the system (in case we are being blocked on trying to acquire one), e.g. with this patch we can immediately see where the problem is below: INFO: task trinity-c3:14933 blocked for more than 120 seconds. Not tainted 4.8.0-rc1+ #135 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. trinity-c3 D ffff88010c16fc88 0 14933 1 0x00080004 ffff88010c16fc88 000000003b9aca00 0000000000000000 0000000000000296 00000000776cdf88 ffff88011a520ae0 ffff88011a520b08 ffff88011a520198 ffffffff867d7f00 ffff88011942c080 ffff880116841580 ffff88010c168000 Call Trace: [] schedule+0x77/0x230 [] __lock_sock+0x129/0x250 [] ? __sk_destruct+0x450/0x450 [] ? wake_bit_function+0x2e0/0x2e0 [] lock_sock_nested+0xeb/0x120 [] irda_setsockopt+0x65/0xb40 [] SyS_setsockopt+0x139/0x230 [] ? SyS_recv+0x20/0x20 [] ? trace_event_raw_event_sys_enter+0xb90/0xb90 [] ? 
__this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? SyS_recv+0x20/0x20 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 Showing all locks held in the system: 2 locks held by khungtaskd/563: #0: (rcu_read_lock){......}, at: [] watchdog+0x106/0x910 #1: (tasklist_lock){......}, at: [] debug_show_all_locks+0x74/0x360 1 lock held by trinity-c0/19280: #0: (sk_lock-AF_IRDA){......}, at: [] irda_accept+0x176/0x10f0 1 lock held by trinity-c0/12865: #0: (sk_lock-AF_IRDA){......}, at: [] irda_accept+0x176/0x10f0 Signed-off-by: Vegard Nossum Cc: Andrew Morton Cc: Linus Torvalds Cc: Mandeep Singh Baines Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471538460-7505-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d234022805dc..432c3d71d195 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -117,7 +117,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_held_locks(t); + debug_show_all_locks(); touch_nmi_watchdog(); -- cgit v1.2.3 From 01175255fd8e3e993353a779f819ec8c0c59137e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:22 -0400 Subject: sched: Remove __schedule() non-standard frame annotation Now that the x86 switch_to() uses the standard C calling convention, the STACK_FRAME_NON_STANDARD() annotation is no longer needed. Suggested-by: Josh Poimboeuf Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-8-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..3d91b63dd2f6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3381,7 +3381,6 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { -- cgit v1.2.3 From d391e552293399396c131544f5b1c2f9b1fb0baa Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 17 Aug 2016 13:50:25 +0100 Subject: cpu/hotplug: Allow suspend/resume CPU to be specified disable_nonboot_cpus() assumes that the lowest numbered online CPU is the boot CPU, and that this is the correct CPU to run any power management code on. On x86 this is always correct, as CPU0 cannot (easily) be taken offline. On arm64 CPU0 can be taken offline. For hibernate/resume this means we may hibernate on a CPU other than CPU0. If the system is rebooted with kexec 'CPU0' will be assigned to a different physical CPU. This complicates hibernate/resume as now we can't trust the CPU numbers. Arch code can find the correct physical CPU, and ensure it is online before resume from hibernate begins, but also needs to influence disable_nonboot_cpus()'s choice of CPU. Rename disable_nonboot_cpus() as freeze_secondary_cpus() and add an argument indicating which CPU should be left standing. 
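The compatibility wrapper lands on the header side (include/linux/cpu.h) and so does not appear in the kernel/ diff below; presumably it reads along these lines:

static inline int disable_nonboot_cpus(void)
{
        /* Preserve the old behaviour: leave CPU0 standing. */
        return freeze_secondary_cpus(0);
}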
Follow the logic in migrate_to_reboot_cpu() to use the lowest numbered online CPU if the requested CPU is not online. Add disable_nonboot_cpus() as an inline function that has the existing behaviour. Cc: Rafael J. Wysocki Reviewed-by: Thomas Gleixner Signed-off-by: James Morse Signed-off-by: Will Deacon --- kernel/cpu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..ebbf027dd4a1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1024,12 +1024,13 @@ EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int disable_nonboot_cpus(void) +int freeze_secondary_cpus(int primary) { - int cpu, first_cpu, error = 0; + int cpu, error = 0; cpu_maps_update_begin(); - first_cpu = cpumask_first(cpu_online_mask); + if (!cpu_online(primary)) + primary = cpumask_first(cpu_online_mask); /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -1038,7 +1039,7 @@ int disable_nonboot_cpus(void) pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { - if (cpu == first_cpu) + if (cpu == primary) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); -- cgit v1.2.3 From 2992ef29ae01af998399d55ed7c692a2505fb8af Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 25 Aug 2016 10:04:45 -0500 Subject: livepatch/module: make TAINT_LIVEPATCH module-specific There's no reliable way to determine which module tainted the kernel with TAINT_LIVEPATCH. For example, /sys/module//taint doesn't report it. Neither does the "mod -t" command in the crash tool. Make it crystal clear who the guilty party is by associating TAINT_LIVEPATCH with any module which sets the "livepatch" modinfo attribute. The flag will still get set in the kernel like before, but now it also sets the same flag in mod->taint. Note that now the taint flag gets set when the module is loaded rather than when it's enabled. I also renamed find_livepatch_modinfo() to check_modinfo_livepatch() to better reflect its purpose: it's basically a livepatch-specific sub-function of check_modinfo(). Reported-by: Chunyu Hu Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Acked-by: Jessica Yu Acked-by: Rusty Russell Signed-off-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 3 --- kernel/module.c | 13 +++++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5fbabe022286..af4643873e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -545,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch) list_prev_entry(patch, list)->state == KLP_DISABLED) return -EBUSY; - pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); - add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); - pr_notice("enabling patch '%s'\n", patch->mod->name); klp_for_each_object(patch, obj) { diff --git a/kernel/module.c b/kernel/module.c index 529efae9f481..f57dd63186e6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1149,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'C'; if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) buf[l++] = 'E'; + if (mod->taints & (1 << TAINT_LIVEPATCH)) + buf[l++] = 'K'; /* * TAINT_FORCED_RMMOD: could be added. 
* TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -2792,14 +2794,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l } #ifdef CONFIG_LIVEPATCH -static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { - mod->klp = get_modinfo(info, "livepatch") ? true : false; + if (get_modinfo(info, "livepatch")) { + mod->klp = true; + add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + } return 0; } #else /* !CONFIG_LIVEPATCH */ -static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { if (get_modinfo(info, "livepatch")) { pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", @@ -2969,7 +2974,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) "is unknown, you have been warned.\n", mod->name); } - err = find_livepatch_modinfo(mod, info); + err = check_modinfo_livepatch(mod, info); if (err) return err; -- cgit v1.2.3 From f72b8792d180948b4b3898374998f5ac8c02e539 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Aug 2016 15:51:50 -0600 Subject: workqueue: add cancel_work() Like cancel_delayed_work(), but for regular work. Signed-off-by: Jens Axboe Mehed-by: Tejun Heo Acked-by: Tejun Heo --- kernel/workqueue.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..bd81f0390277 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); -- cgit v1.2.3 From fa2bea2f5cca5b8d4a3e5520d2e8c0ede67ac108 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 30 Aug 2016 17:19:13 -0400 Subject: audit: consistently record PIDs with task_tgid_nr() Unfortunately we record PIDs in audit records using a variety of methods despite the correct way being the use of task_tgid_nr(). This patch converts all of these callers, except for the case of AUDIT_SET in audit_receive_msg() (see the comment in the code). 
Reported-by: Jeff Vander Stoep Signed-off-by: Paul Moore --- kernel/audit.c | 8 +++++++- kernel/auditsc.c | 12 ++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8d528f9930da..02bde12685bd 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -877,6 +877,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; pid_t requesting_pid = task_tgid_vnr(current); @@ -1917,7 +1923,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2672d105cffc..3824b1bbeae1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -455,7 +455,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1993,7 +1993,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, loginuid = from_kuid(&init_user_ns, kloginuid), tty = audit_get_tty(current); - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? 
tty_name(tty) : "(none)", @@ -2220,7 +2220,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2245,7 +2245,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2345,7 +2345,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2377,7 +2377,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } -- cgit v1.2.3 From 537f7ccb396804c6d0057b93ba8eb104ba44f851 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Aug 2016 14:37:37 -0500 Subject: mntns: Add a limit on the number of mount namespaces. v2: Fixed the very obvious lack of setting ucounts on struct mnt_ns reported by Andrei Vagin, and the kbuild test report. Reported-by: Andrei Vagin Acked-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/ucount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 205f1a07faac..9d20d5dd298a 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -72,6 +72,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_uts_namespaces"), UCOUNT_ENTRY("max_ipc_namespaces"), UCOUNT_ENTRY("max_net_namespaces"), + UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), { } }; -- cgit v1.2.3 From b4d90e9f1ef1f19dcb2b1b1942c786c9c4225460 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 23 Jun 2016 20:50:37 +0200 Subject: hrtimer: Spelling fixes Fix a minor spelling error. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Pratyush Patel [jstultz: Added commit message] Signed-off-by: John Stultz --- kernel/time/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c820fc23..252ea4741592 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* - * Called from timekeeping and resume code to reprogramm the hrtimer + * Called from timekeeping and resume code to reprogram the hrtimer * interrupt device on all cpus. 
*/ void clock_was_set_delayed(void) @@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, /* * Note: We clear the running state after enqueue_hrtimer and - * we do not reprogramm the event hardware. Happens either in + * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, -- cgit v1.2.3 From 36374583f9084cdab4b5dcf5521a3ce55bebb9fa Mon Sep 17 00:00:00 2001 From: Kyle Walker Date: Sat, 6 Aug 2016 12:07:30 -0400 Subject: clocksource: Defer override invalidation unless clock is unstable Clocksources don't get the VALID_FOR_HRES flag until they have been checked by a watchdog. However, when using an override, the clocksource_select logic will clear the override value if the clocksource is not marked VALID_FOR_HRES during that initial check. When using the boot argument clocksource=, this selection can run before the watchdog, and can cause the override to be incorrectly cleared. To address this condition, the override_name is only invalidated for unstable clocksources. Otherwise, the override is left intact until after the watchdog has validated the clocksource as stable/unstable. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Martin Schwidefsky Signed-off-by: Kyle Walker Signed-off-by: John Stultz --- kernel/time/clocksource.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310a1a53..7e4fad75acaa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", - cs->name); - override_name[0] = 0; + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); + override_name[0] = 0; + } else { + /* + * The override cannot be currently verified. + * Deferring to let the watchdog check. + */ + pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", + cs->name); + } } else /* Override clocksource can be used. */ best = cs; -- cgit v1.2.3 From 0bf43f15db857e83daf4134aa062c8b157a80ee0 Mon Sep 17 00:00:00 2001 From: Ruchi Kandoi Date: Thu, 11 Aug 2016 14:35:01 -0700 Subject: timekeeping: Prints the amounts of time spent during suspend In addition to keeping a histogram of suspend times, also print out the time spent in suspend to dmesg. This helps to keep track of suspend time while debugging using kernel logs.
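With this change a resume adds a dmesg line of the form "Suspended for 11.305 seconds" (values illustrative). Because tv_nsec is always below NSEC_PER_SEC, dividing it by NSEC_PER_MSEC yields 0-999 and the %03lu conversion prints exactly three fractional digits; a standalone userspace sketch of the same arithmetic:

#include <stdio.h>

int main(void)
{
        /* Illustrative values for a suspend of ~11.3 seconds */
        long long tv_sec = 11;
        long tv_nsec = 305000000L;

        /* Same conversion as the patch: nanoseconds (0..999999999)
         * divided by 1000000 (NSEC_PER_MSEC) gives 0..999. */
        printf("Suspended for %lld.%03lu seconds\n",
               tv_sec, (unsigned long)(tv_nsec / 1000000L));
        return 0;
}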
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Ruchi Kandoi [jstultz: Tweaked commit message] Signed-off-by: John Stultz --- kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 107310a6f36f..ca9fb800336b 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t) int bin = min(fls(t->tv_sec), NUM_BINS-1); sleep_time_bin[bin]++; + pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } -- cgit v1.2.3 From 469e857f374640f6164913835ce30d0736b40a60 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 12 Aug 2016 20:14:09 +0200 Subject: time: Avoid undefined behaviour in timespec64_add_safe() I ran into this: ================================================================================ UBSAN: Undefined behaviour in kernel/time/time.c:783:2 signed integer overflow: 5273 + 9223372036854771711 cannot be represented in type 'long int' CPU: 0 PID: 17363 Comm: trinity-c0 Not tainted 4.8.0-rc1+ #88 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 0000000000000000 ffff88011457f8f0 ffffffff82344f50 0000000041b58ab3 ffffffff84f98080 ffffffff82344ea4 ffff88011457f918 ffff88011457f8c8 ffff88011457f8e0 7fffffffffffefff ffff88011457f6d8 dffffc0000000000 Call Trace: [] dump_stack+0xac/0xfc [] ? _atomic_dec_and_lock+0xc4/0xc4 [] ubsan_epilogue+0xd/0x8a [] handle_overflow+0x202/0x23d [] ? val_to_string.constprop.6+0x11e/0x11e [] ? debug_smp_processor_id+0x17/0x20 [] ? __sigqueue_free.part.13+0x51/0x70 [] ? rcu_is_watching+0x110/0x110 [] __ubsan_handle_add_overflow+0xe/0x10 [] timespec64_add_safe+0x298/0x340 [] ? timespec_add_safe+0x330/0x330 [] ? wait_noreap_copyout+0x1d0/0x1d0 [] poll_select_set_timeout+0xf8/0x170 [] ? poll_schedule_timeout+0x2b0/0x2b0 [] ? __might_sleep+0x5b/0x260 [] __sys_recvmmsg+0x107/0x790 [] ? SyS_recvmsg+0x20/0x20 [] ? hrtimer_start_range_ns+0x3b8/0x1380 [] ? _raw_spin_unlock_irqrestore+0x3b/0x60 [] ? do_setitimer+0x39a/0x8e0 [] ? __might_sleep+0x5b/0x260 [] ? __sys_recvmmsg+0x790/0x790 [] SyS_recvmmsg+0xd9/0x160 [] ? __sys_recvmmsg+0x790/0x790 [] ? __this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? __sys_recvmmsg+0x790/0x790 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 ================================================================================ Line 783 is this: 783 set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, 784 lhs.tv_nsec + rhs.tv_nsec); In other words, since lhs.tv_sec and rhs.tv_sec are both time64_t, this is a signed addition which will cause undefined behaviour on overflow. Note that this is not currently a huge concern since the kernel should be built with -fno-strict-overflow by default, but could be a problem in the future, a problem with older compilers, or other compilers than gcc. The easiest way to avoid the overflow is to cast one of the arguments to unsigned (so the addition will be done using unsigned arithmetic). 
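A standalone sketch of the idiom, illustrative rather than the kernel code: the addition is performed in unsigned arithmetic, where wrap-around is well defined, and the existing res.tv_sec < lhs.tv_sec check then catches the wrap. Converting the wrapped value back to a signed type is implementation-defined rather than undefined, which is exactly the point of the cast.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Illustrative operands near the overflow boundary */
        int64_t lhs = INT64_MAX - 100;
        int64_t rhs = 5273;

        /* Unsigned addition wraps modulo 2^64 instead of
         * invoking undefined behaviour. */
        int64_t sum = (int64_t)((uint64_t)lhs + (uint64_t)rhs);

        /* Detect the wrap the same way timespec64_add_safe()
         * does, then saturate instead of using the bogus sum. */
        if (sum < lhs || sum < rhs)
                puts("overflow detected, saturating result");
        else
                printf("sum = %lld\n", (long long)sum);
        return 0;
}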
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Vegard Nossum Signed-off-by: John Stultz --- kernel/time/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 667b9335f5d6..bd62fb8e8e77 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, { struct timespec64 res; - set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { -- cgit v1.2.3 From 979515c5645830465739254abc1b1648ada41518 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 13 Aug 2016 01:37:04 +0200 Subject: time: Avoid undefined behaviour in ktime_add_safe() I ran into this: ================================================================================ UBSAN: Undefined behaviour in kernel/time/hrtimer.c:310:16 signed integer overflow: 9223372036854775807 + 50000 cannot be represented in type 'long long int' CPU: 2 PID: 4798 Comm: trinity-c2 Not tainted 4.8.0-rc1+ #91 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 0000000000000000 ffff88010ce6fb88 ffffffff82344740 0000000041b58ab3 ffffffff84f97a20 ffffffff82344694 ffff88010ce6fbb0 ffff88010ce6fb60 000000000000c350 ffff88010ce6f968 dffffc0000000000 ffffffff857bc320 Call Trace: [] dump_stack+0xac/0xfc [] ? _atomic_dec_and_lock+0xc4/0xc4 [] ubsan_epilogue+0xd/0x8a [] handle_overflow+0x202/0x23d [] ? val_to_string.constprop.6+0x11e/0x11e [] ? timerqueue_add+0x151/0x410 [] ? hrtimer_start_range_ns+0x3b8/0x1380 [] ? memset+0x31/0x40 [] __ubsan_handle_add_overflow+0xe/0x10 [] hrtimer_nanosleep+0x5d9/0x790 [] ? hrtimer_init_sleeper+0x80/0x80 [] ? __might_sleep+0x5b/0x260 [] common_nsleep+0x20/0x30 [] SyS_clock_nanosleep+0x197/0x210 [] ? SyS_clock_getres+0x150/0x150 [] ? __this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? SyS_clock_getres+0x150/0x150 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 ================================================================================ Add a new ktime_add_unsafe() helper which doesn't check for overflow, but doesn't throw a UBSAN warning when it does overflow either. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Vegard Nossum Signed-off-by: John Stultz --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 252ea4741592..bb5ec425dfe0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { - ktime_t res = ktime_add(lhs, rhs); + ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can -- cgit v1.2.3 From 5ba8a4a96f6eaa6af88e24c7794f142217aa3b6f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 25 Aug 2016 18:21:09 +0300 Subject: tracing/uprobe: Drop isdigit() check in create_trace_uprobe It's useless. 
Before: [tracing]# echo 'p:test /a:0x0' >> uprobe_events [tracing]# echo 'p:test a:0x0' >> uprobe_events -bash: echo: write error: No such file or directory [tracing]# echo 'p:test 1:0x0' >> uprobe_events -bash: echo: write error: Invalid argument After: [tracing]# echo 'p:test 1:0x0' >> uprobe_events -bash: echo: write error: No such file or directory Link: http://lkml.kernel.org/r/20160825152110.25663-3-dsafonov@virtuozzo.com Acked-by: Srikar Dronamraju Acked-by: Oleg Nesterov Signed-off-by: Dmitry Safonov Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..a74f2d9ff379 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -427,10 +427,6 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - pr_info("probe point must be have a filename.\n"); - return -EINVAL; - } arg = strchr(argv[1], ':'); if (!arg) { ret = -EINVAL; -- cgit v1.2.3 From 613dccdf681aed9f9d1243bb2b8cd864a887802f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 1 Sep 2016 11:43:54 +0900 Subject: function_graph: Handle TRACE_BPUTS in print_graph_comment It missed to handle TRACE_BPUTS so messages recorded by trace_bputs() will be shown with symbol info unnecessarily. You can see it with the trace_printk sample code: # cd /sys/kernel/tracing/ # echo sys_sync > set_graph_function # echo 1 > options/sym-offset # echo function_graph > current_tracer Note that the sys_sync filter was there to prevent recording other functions and the sym-offset option was needed since the first message was called from a module init function so kallsyms doesn't have the symbol and omitted in the output. 
# cd ~/build/kernel # insmod samples/trace_printk/trace-printk.ko # cd - # head trace Before: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 1) | /* 0xffffffffa0002000: This is a static string that will use trace_bputs */ 1) | /* This is a dynamic string that will use trace_puts */ 1) | /* trace_printk_irq_work+0x5/0x7b [trace_printk]: (irq) This is a static string that will use trace_bputs */ 1) | /* (irq) This is a dynamic string that will use trace_puts */ 1) | /* (irq) This is a static string that will use trace_bprintk() */ 1) | /* (irq) This is a dynamic string that will use trace_printk */ After: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 1) | /* This is a static string that will use trace_bputs */ 1) | /* This is a dynamic string that will use trace_puts */ 1) | /* (irq) This is a static string that will use trace_bputs */ 1) | /* (irq) This is a dynamic string that will use trace_puts */ 1) | /* (irq) This is a static string that will use trace_bprintk() */ 1) | /* (irq) This is a dynamic string that will use trace_printk */ Link: http://lkml.kernel.org/r/20160901024354.13720-1-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..e14017c36170 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1120,6 +1120,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, trace_seq_puts(s, "/* "); switch (iter->ent->type) { + case TRACE_BPUTS: + ret = trace_print_bputs_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; case TRACE_BPRINT: ret = trace_print_bprintk_msg_only(iter); if (ret != TRACE_TYPE_HANDLED) -- cgit v1.2.3 From 8861dd303cba879bae9a9dcee74042fb642bf03b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Aug 2016 11:55:29 +0900 Subject: ftrace: Access ret_stack->subtime only in the function profiler The subtime is used only for function profiler with function graph tracer enabled. Move the definition of subtime under CONFIG_FUNCTION_PROFILER to reduce the memory usage. Also move the initialization of subtime into the graph entry callback. 
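The saving comes from compiling the field out when the profiler is not configured. The matching header change lives outside the kernel/ directory shown in this log, but its shape is roughly the following (a sketch; unrelated fields are elided):

struct ftrace_ret_stack {
        unsigned long           ret;
        unsigned long           func;
        unsigned long long      calltime;
#ifdef CONFIG_FUNCTION_PROFILER
        /* Only the function profiler reads subtime, so only pay
         * its memory cost when CONFIG_FUNCTION_PROFILER=y. */
        unsigned long long      subtime;
#endif
        /* frame pointer and other fields elided */
};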
Link: http://lkml.kernel.org/r/20160831025529.24018-1-namhyung@kernel.org Cc: Ingo Molnar Cc: Josh Poimboeuf Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++++ kernel/trace/trace_functions_graph.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84752c8e28b5..2050a7652a86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { + int index = trace->depth; + function_profile_call(trace->func, 0, NULL, NULL); + + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) + current->ret_stack[index].subtime = 0; + return 1; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e14017c36170..148c90f1e49b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -170,7 +170,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; - current->ret_stack[index].subtime = 0; current->ret_stack[index].fp = frame_pointer; *depth = current->curr_ret_stack; -- cgit v1.2.3 From fc590c22f9f056ab50190b797f6cacead29f9b75 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Sep 2016 14:45:19 +0200 Subject: genirq: Robustify handle_percpu_devid_irq() The percpu_devid handler is not robust against spurious interrupts. If a spurious interrupt happens and no action is installed then the handler crashes with a NULL pointer dereference. Add a sanity check for this and log the wreckage once in dmesg. Reported-by: Majun Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Marc Zyngier Cc: guohanjun@huawei.com Cc: dingtianhong@huawei.com Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1609021436160.5647@nanos --- kernel/irq/chip.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b4c1bc7c9ca2..93c373a8b12b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -756,7 +756,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; - void *dev_id = raw_cpu_ptr(action->percpu_dev_id); unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; @@ -765,9 +764,20 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); - trace_irq_handler_entry(irq, action); - res = action->handler(irq, dev_id); - trace_irq_handler_exit(irq, action, res); + if (likely(action)) { + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); + } else { + unsigned int cpu = smp_processor_id(); + bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + + if (enabled) + irq_percpu_disable(desc, cpu); + + pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", + enabled ? 
" and unmasked" : "", irq, cpu); + } if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); -- cgit v1.2.3 From 48e0fba842c7daab80f3351398146368c5504a27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Sep 2016 17:30:35 +0200 Subject: genirq: Remove export of irq_map_generic_chip() No module users. Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..5fbb94b077b3 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -409,7 +409,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } -EXPORT_SYMBOL_GPL(irq_map_generic_chip); struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, -- cgit v1.2.3 From f0c450eaa364cb77c778f2a46ee2aa3ff464b332 Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Mon, 1 Aug 2016 16:27:53 +0200 Subject: genirq/generic_chip: Get rid of code duplication irq_map_generic_chip() contains about the same code as irq_get_domain_generic_chip() except for the return values. Split out the irq_get_domain_generic_chip() implementation so it can be reused. [ tglx: Removed the extra churn in irq_get_domain_generic_chip() callers and massaged changelog ] Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/579F5C69.8070006@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5fbb94b077b3..11ad73b39d2e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -328,6 +328,20 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, } EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); +static struct irq_chip_generic * +__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return ERR_PTR(-ENODEV); + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return ERR_PTR(-EINVAL); + return dgc->gc[idx]; +} + /** * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq * @d: irq domain pointer @@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); struct irq_chip_generic * irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) { - struct irq_domain_chip_generic *dgc = d->gc; - int idx; + struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq); - if (!dgc) - return NULL; - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return NULL; - return dgc->gc[idx]; + return !IS_ERR(gc) ? 
gc : NULL; } EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); @@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, unsigned long flags; int idx; - if (!d->gc) - return -ENODEV; - - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return -EINVAL; - gc = dgc->gc[idx]; + gc = __irq_get_domain_generic_chip(d, hw_irq); + if (IS_ERR(gc)) + return PTR_ERR(gc); idx = hw_irq % dgc->irqs_per_chip; -- cgit v1.2.3 From ee26c013cdee0b947e29d6cadfb9ff3341c69ff9 Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Mon, 1 Aug 2016 16:27:38 +0200 Subject: genirq/generic_chip: Add irq_unmap callback Without this patch irq_domain_disassociate() cannot properly release the interrupt. In fact, irq_map_generic_chip() checks a bit on 'gc->installed' but said bit is never cleared, only set. Commit 088f40b7b027 ("genirq: Generic chip: Add linear irq domain support") added irq_map_generic_chip() function and also stated "This lacks a removal function for now". This commit provides an implementation of an unmap function that can be called by irq_domain_disassociate(). [ tglx: Made the function static and removed the export as we have neither a prototype nor a modular user. ] Fixes: 088f40b7b027 ("genirq: Generic chip: Add linear irq domain support") Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/579F5C5A.2070507@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 11ad73b39d2e..a3a392097804 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -414,8 +414,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, return 0; } +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} + struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); -- cgit v1.2.3 From 0c228919e04ddec195402296e7ebf2472ed6caef Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Tue, 2 Aug 2016 10:52:45 +0200 Subject: irqdomain: Mask irq type in irq_domain_xlate_onetwocell() According to the xlate() callback definition, the 'out_type' parameter needs to be the "linux irq type". A mask for such bits exists, IRQ_TYPE_SENSE_MASK, which is correctly applied in irq_domain_xlate_twocell() So use it for irq_domain_xlate_onetwocell() as well. 
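Illustratively, a firmware-provided specifier cell may carry flag bits above the four trigger-sense bits, and only the latter belong in the linux irq type (the cell value below is hypothetical, and the helper exists only for this example):

#include <linux/irq.h>

static unsigned int xlate_example_type(u32 cell)
{
        /* A hypothetical raw cell of 0x104 yields 0x4, i.e.
         * IRQ_TYPE_LEVEL_HIGH; the stray 0x100 bit is masked
         * off instead of leaking into the linux irq type. */
        return cell & IRQ_TYPE_SENSE_MASK;
}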
Signed-off-by: Sebastian Frias Cc: Grant Likely Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/57A05F5D.103@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4752b43662e0..f10cffe8aefb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -868,7 +868,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; - *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + if (intsize > 1) + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + else + *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); -- cgit v1.2.3 From e7c15cd8a113335cf7154f027c9c8da1a92238ee Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 Jun 2016 12:45:36 -0400 Subject: tracing: Added hardware latency tracer The hardware latency tracer has been in the PREEMPT_RT patch for some time. It is used to detect possible SMIs or any other hardware interruptions that the kernel is unaware of. Note that NMIs may also be detected; this is worth bearing in mind when interpreting the results. The logic is pretty simple. It simply creates a thread that spins on a single CPU for a specified amount of time (width) within a periodic window (window). These numbers may be adjusted by their corresponding names in /sys/kernel/tracing/hwlat_detector/ The defaults are window = 1000000 us (1 second) width = 500000 us (1/2 second) The loop consists of: t1 = trace_clock_local(); t2 = trace_clock_local(); Where trace_clock_local() is a variant of sched_clock(). The difference t2 - t1 is recorded as the "inner" timestamp, and the difference t1 - prev_t2 is recorded as the "outer" timestamp. If either of these differences is greater than the time denoted in /sys/kernel/tracing/tracing_thresh then it records the event. When this tracer is started, and tracing_thresh is zero, it changes to the default threshold of 10 us. The hwlat tracer in the PREEMPT_RT patch was originally written by Jon Masters. I have modified it quite a bit and turned it into a tracer. Based-on-code-by: Jon Masters Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 35 +++ kernel/trace/Makefile | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 3 + kernel/trace/trace_entries.h | 23 ++ kernel/trace/trace_hwlat.c | 527 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 52 +++++ 7 files changed, 642 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/trace_hwlat.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..72c07c2ffd79 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -221,6 +221,41 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config HWLAT_TRACER + bool "Tracer to detect hardware latencies (like SMIs)" + select GENERIC_TRACER + help + This tracer, when enabled, will create one or more kernel threads, + depending on what the cpumask file is set to, with each thread + spinning in a loop looking for interruptions caused by + something other than the kernel. For example, if a + System Management Interrupt (SMI) takes a noticeable amount of + time, this tracer will detect it. This is useful for testing + if a system is reliable for Real Time tasks.
+ + Some files are created in the tracing directory when this + is enabled: + + hwlat_detector/width - time in usecs for how long to spin for + hwlat_detector/window - time in usecs between the start of each + iteration + + A kernel thread is created that will spin with interrupts disabled + for "width" microseconds in every "window" cycle. It will not spin + for "window - width" microseconds, where the system can + continue to operate. + + The output will appear in the trace and trace_pipe files. + + When the tracer is not running, it has no effect on the system, + but when it is running, it can cause the system to be + periodically non-responsive. Do not run this tracer on a + production system. + + To enable this tracer, echo "hwlat" into the current_tracer + file. Every time a latency is greater than tracing_thresh, it will + be recorded into the ring buffer. + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d0a1617b52b4..992ab9d99f35 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dade4c9559cc..474cc814e16d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not.
*/ -static int tracer_tracing_is_on(struct trace_array *tr) +int tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f783df416726..1d866b0c1567 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -38,6 +38,7 @@ enum trace_type { TRACE_USER_STACK, TRACE_BLK, TRACE_BPUTS, + TRACE_HWLAT, __TRACE_LAST_TYPE, }; @@ -326,6 +327,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ + IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -571,6 +573,7 @@ void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); +int tracer_tracing_is_on(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 5c30efcda5e6..70d47dd99359 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -322,3 +322,26 @@ FTRACE_ENTRY(branch, trace_branch, FILTER_OTHER ); + +FTRACE_ENTRY(hwlat, hwlat_entry, + + TRACE_HWLAT, + + F_STRUCT( + __field( u64, duration ) + __field( u64, outer_duration ) + __field_struct( struct timespec, timestamp ) + __field_desc( long, timestamp, tv_sec ) + __field_desc( long, timestamp, tv_nsec ) + __field( unsigned int, seqnum ) + ), + + F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\n", + __entry->seqnum, + __entry->tv_sec, + __entry->tv_nsec, + __entry->duration, + __entry->outer_duration), + + FILTER_OTHER +); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 000000000000..08dfabe4e862 --- /dev/null +++ b/kernel/trace/trace_hwlat.c @@ -0,0 +1,527 @@ +/* + * trace_hwlatdetect.c - A simple Hardware Latency detector. + * + * Use this tracer to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this tracer is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. 
+ * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this tracer while running in a production + * environment requiring any kind of low-latency performance + * guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. + * + * Includes useful feedback from Clark Williams + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ +#include +#include +#include +#include +#include "trace.h" + +static struct trace_array *hwlat_trace; + +#define U64STR_SIZE 22 /* 20 digits max */ + +#define BANNER "hwlat_detector: " +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* sampling thread*/ +static struct task_struct *hwlat_kthread; + +static struct dentry *hwlat_sample_width; /* sample width us */ +static struct dentry *hwlat_sample_window; /* sample window us */ + +/* Save the previous tracing_thresh value */ +static unsigned long save_tracing_thresh; + +/* If the user changed threshold, remember it */ +static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; + +/* Individual latency samples are stored here when detected. */ +struct hwlat_sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* delta */ + u64 outer_duration; /* delta (outer loop) */ + struct timespec timestamp; /* wall time */ +}; + +/* keep the global state somewhere. 
*/ +static struct hwlat_data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + +} hwlat_data = { + .sample_window = DEFAULT_SAMPLE_WINDOW, + .sample_width = DEFAULT_SAMPLE_WIDTH, +}; + +static void trace_hwlat_sample(struct hwlat_sample *sample) +{ + struct trace_array *tr = hwlat_trace; + struct trace_event_call *call = &event_hwlat; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct hwlat_entry *entry; + unsigned long flags; + int pc; + + pc = preempt_count(); + local_save_flags(flags); + + event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), + flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->seqnum = sample->seqnum; + entry->duration = sample->duration; + entry->outer_duration = sample->outer_duration; + entry->timestamp = sample->timestamp; + + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); +} + +/* Macros to encapsulate the time capturing infrastructure */ +#define time_type u64 +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) +#define init_time(a, b) (a = b) +#define time_u64(a) a + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called with interrupts disabled and with + * hwlat_data.lock held. + */ +static int get_sample(void) +{ + struct trace_array *tr = hwlat_trace; + time_type start, t1, t2, last_t2; + s64 diff, total, last_total = 0; + u64 sample = 0; + u64 thresh = tracing_thresh; + u64 outer_sample = 0; + int ret = -1; + + do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ + + init_time(last_t2, 0); + start = time_get(); /* start timestamp */ + + do { + + t1 = time_get(); /* we'll look for a discontinuity */ + t2 = time_get(); + + if (time_u64(last_t2)) { + /* Check the delta from outer loop (t2 to next t1) */ + diff = time_to_us(time_sub(t1, last_t2)); + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + if (diff > outer_sample) + outer_sample = diff; + } + last_t2 = t2; + + total = time_to_us(time_sub(t2, start)); /* sample width */ + + /* Check for possible overflows */ + if (total < last_total) { + pr_err("Time total overflowed\n"); + break; + } + last_total = total; + + /* This checks the inner loop (t1 to t2) */ + diff = time_to_us(time_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= hwlat_data.sample_width); + + ret = 0; + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > thresh || outer_sample > thresh) { + struct hwlat_sample s; + + ret = 1; + + hwlat_data.count++; + s.seqnum = hwlat_data.count; + s.duration = sample; + s.outer_duration = outer_sample; + s.timestamp = CURRENT_TIME; + trace_hwlat_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > tr->max_latency) + tr->max_latency = sample; + } + +out: + return ret; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * + * Used to 
periodically sample the CPU TSC via a call to get_sample. We + * disable interrupts, which does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus preempting). + * Obviously this should never be used in production environments. + * + * Currently this runs on which ever CPU it was scheduled on, but most + * real-world hardware latency situations occur across several CPUs, + * but we might later generalize this if we find there are any actualy + * systems with alternate SMI delivery or other hardware latencies. + */ +static int kthread_fn(void *data) +{ + u64 interval; + + while (!kthread_should_stop()) { + + local_irq_disable(); + get_sample(); + local_irq_enable(); + + mutex_lock(&hwlat_data.lock); + interval = hwlat_data.sample_window - hwlat_data.sample_width; + mutex_unlock(&hwlat_data.lock); + + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + /* Always sleep for at least 1ms */ + if (interval < 1) + interval = 1; + + if (msleep_interruptible(interval)) + break; + } + + return 0; +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts the kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(struct trace_array *tr) +{ + struct task_struct *kthread; + + kthread = kthread_create(kthread_fn, NULL, "hwlatd"); + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + return -ENOMEM; + } + hwlat_kthread = kthread; + wake_up_process(kthread); + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static void stop_kthread(void) +{ + if (!hwlat_kthread) + return; + kthread_stop(hwlat_kthread); + hwlat_kthread = NULL; +} + +/* + * hwlat_read - Wrapper read function for reading both window and width + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a generic read implementation for the global state + * "hwlat_data" structure filesystem entries. + */ +static ssize_t hwlat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[U64STR_SIZE]; + u64 *entry = filp->private_data; + u64 val; + int len; + + if (!entry) + return -EFAULT; + + if (cnt > sizeof(buf)) + cnt = sizeof(buf); + + val = *entry; + + len = snprintf(buf, sizeof(buf), "%llu\n", val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +/** + * hwlat_width_write - Write function for "width" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "width" interface + * to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. 
It + * is enforced that width is less that the total window size. + */ +static ssize_t +hwlat_width_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + u64 val; + int err; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + mutex_lock(&hwlat_data.lock); + if (val < hwlat_data.sample_window) + hwlat_data.sample_width = val; + else + err = -EINVAL; + mutex_unlock(&hwlat_data.lock); + + if (err) + return err; + + return cnt; +} + +/** + * hwlat_window_write - Write function for "window" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "window" interface + * to the hardware latency detetector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enfoced that any value written must be greater than the sample width + * size, or an error results. + */ +static ssize_t +hwlat_window_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + u64 val; + int err; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + mutex_lock(&hwlat_data.lock); + if (hwlat_data.sample_width < val) + hwlat_data.sample_window = val; + else + err = -EINVAL; + mutex_unlock(&hwlat_data.lock); + + if (err) + return err; + + return cnt; +} + +static const struct file_operations width_fops = { + .open = tracing_open_generic, + .read = hwlat_read, + .write = hwlat_width_write, +}; + +static const struct file_operations window_fops = { + .open = tracing_open_generic, + .read = hwlat_read, + .write = hwlat_window_write, +}; + +/** + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "hwlat_detector". + * It creates the hwlat_detector directory in the tracing directory, + * and within that directory is the count, width and window files to + * change and view those values. 
+ */ +static int init_tracefs(void) +{ + struct dentry *d_tracer; + struct dentry *top_dir; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return -ENOMEM; + + top_dir = tracefs_create_dir("hwlat_detector", d_tracer); + if (!top_dir) + return -ENOMEM; + + hwlat_sample_window = tracefs_create_file("window", 0640, + top_dir, + &hwlat_data.sample_window, + &window_fops); + if (!hwlat_sample_window) + goto err; + + hwlat_sample_width = tracefs_create_file("width", 0644, + top_dir, + &hwlat_data.sample_width, + &width_fops); + if (!hwlat_sample_width) + goto err; + + return 0; + + err: + tracefs_remove_recursive(top_dir); + return -ENOMEM; +} + +static void hwlat_tracer_start(struct trace_array *tr) +{ + int err; + + err = start_kthread(tr); + if (err) + pr_err(BANNER "Cannot start hwlat kthread\n"); +} + +static void hwlat_tracer_stop(struct trace_array *tr) +{ + stop_kthread(); +} + +static bool hwlat_busy; + +static int hwlat_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (hwlat_busy) + return -EBUSY; + + hwlat_trace = tr; + + hwlat_data.count = 0; + tr->max_latency = 0; + save_tracing_thresh = tracing_thresh; + + /* tracing_thresh is in nsecs, we speak in usecs */ + if (!tracing_thresh) + tracing_thresh = last_tracing_thresh; + + if (tracer_tracing_is_on(tr)) + hwlat_tracer_start(tr); + + hwlat_busy = true; + + return 0; +} + +static void hwlat_tracer_reset(struct trace_array *tr) +{ + stop_kthread(); + + /* the tracing threshold is static between runs */ + last_tracing_thresh = tracing_thresh; + + tracing_thresh = save_tracing_thresh; + hwlat_busy = false; +} + +static struct tracer hwlat_tracer __read_mostly = +{ + .name = "hwlat", + .init = hwlat_tracer_init, + .reset = hwlat_tracer_reset, + .start = hwlat_tracer_start, + .stop = hwlat_tracer_stop, + .allow_instances = true, +}; + +__init static int init_hwlat_tracer(void) +{ + int ret; + + mutex_init(&hwlat_data.lock); + + ret = register_tracer(&hwlat_tracer); + if (ret) + return ret; + + init_tracefs(); + + return 0; +} +late_initcall(init_hwlat_tracer); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0bb9cf2d53e6..d67a562df259 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1098,6 +1098,57 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_HWLAT */ +static enum print_line_t +trace_hwlat_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct hwlat_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld\n", + field->seqnum, + field->duration, + field->outer_duration, + field->timestamp.tv_sec, + field->timestamp.tv_nsec); + + return trace_handle_return(s); +} + + +static enum print_line_t +trace_hwlat_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct hwlat_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", + field->duration, + field->outer_duration, + field->timestamp.tv_sec, + field->timestamp.tv_nsec, + field->seqnum); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_hwlat_funcs = { + .trace = trace_hwlat_print, + .raw = trace_hwlat_raw, +}; + +static struct trace_event trace_hwlat_event = { + .type = TRACE_HWLAT, + .funcs = 
&trace_hwlat_funcs, +}; + /* TRACE_BPUTS */ static enum print_line_t trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1233,6 +1284,7 @@ static struct trace_event *events[] __initdata = { &trace_bputs_event, &trace_bprint_event, &trace_print_event, + &trace_hwlat_event, NULL }; -- cgit v1.2.3 From 0330f7aa8ee63d0c435c0cb4e47ea06235ee4b7f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 15 Jul 2016 15:48:56 -0400 Subject: tracing: Have hwlat trace migrate across tracing_cpumask CPUs Instead of having the hwlat detector thread stay on one CPU, have it migrate across all the CPUs specified by tracing_cpumask. If the user modifies the thread's CPU affinity, the migration will stop until the next instance that the tracer is instantiated. The migration happens at the end of each window (period). Signed-off-by: Steven Rostedt --- kernel/trace/trace_hwlat.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 08dfabe4e862..65aab3914a56 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include "trace.h" @@ -211,6 +212,57 @@ out: return ret; } +static struct cpumask save_cpumask; +static bool disable_migrate; + +static void move_to_next_cpu(void) +{ + static struct cpumask *current_mask; + int next_cpu; + + if (disable_migrate) + return; + + /* Just pick the first CPU on first iteration */ + if (!current_mask) { + current_mask = &save_cpumask; + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + put_online_cpus(); + next_cpu = cpumask_first(current_mask); + goto set_affinity; + } + + /* + * If for some reason the user modifies the CPU affinity + * of this thread, than stop migrating for the duration + * of the current test. + */ + if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) + goto disable; + + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + next_cpu = cpumask_next(smp_processor_id(), current_mask); + put_online_cpus(); + + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(current_mask); + + set_affinity: + if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ + goto disable; + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + + sched_setaffinity(0, current_mask); + return; + + disable: + disable_migrate = true; +} + /* * kthread_fn - The CPU time sampling/hardware latency detection kernel thread * @@ -230,6 +282,8 @@ static int kthread_fn(void *data) while (!kthread_should_stop()) { + move_to_next_cpu(); + local_irq_disable(); get_sample(); local_irq_enable(); @@ -473,6 +527,7 @@ static int hwlat_tracer_init(struct trace_array *tr) hwlat_trace = tr; + disable_migrate = false; hwlat_data.count = 0; tr->max_latency = 0; save_tracing_thresh = tracing_thresh; -- cgit v1.2.3 From 7b2c86250122de316cbab8754050622ead04af39 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 4 Aug 2016 12:49:53 -0400 Subject: tracing: Add NMI tracing in hwlat detector As NMIs can also cause latency when interrupts are disabled, the hwlat detectory has no way to know if the latency it detects is from an NMI or an SMI or some other hardware glitch. 
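Conceptually, the problem and its fix look like this (an illustrative timeline, using the variable and helper names the patch below introduces):

/*
 * One inner sample of the hwlat loop:
 *
 *   t1 = time_get();
 *      <NMI entry>  nmi_ts_start = time_get();
 *      <NMI exit>   nmi_total_ts = time_get() - nmi_ts_start;
 *   t2 = time_get();
 *
 * The NMI runtime is folded into the t2 - t1 delta, so without
 * the callbacks the sample cannot distinguish NMI time from
 * SMI/firmware time; recording nmi_total_ts and nmi_count with
 * the sample makes the source of the latency visible.
 */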
As ftrace_nmi_enter/exit() funtions are no longer used (except for sh, which isn't supported anymore), I converted those to "arch_ftrace_nmi_enter/exit" and use ftrace_nmi_enter/exit() to check if hwlat detector is tracing or not, and if so, it calls into the hwlat utility. Since the hwlat detector only has a single kthread that is spinning with interrupts disabled, it marks what CPU it is on, and if the NMI callback happens on that CPU, it records the time spent in that NMI. This is added to the output that is generated by the hwlat detector as: #3 inner/outer(us): 9/9 ts:1470836488.206734548 #4 inner/outer(us): 0/8 ts:1470836497.140808588 #5 inner/outer(us): 0/6 ts:1470836499.140825168 nmi-total:5 nmi-count:1 #6 inner/outer(us): 9/9 ts:1470836501.140841748 All time is still tracked in microseconds. The NMI information is only shown when an NMI occurred during the sample. Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 8 +++++-- kernel/trace/trace_hwlat.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 16 +++++++++++++- 3 files changed, 72 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 70d47dd99359..d1cc37e78f99 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -330,18 +330,22 @@ FTRACE_ENTRY(hwlat, hwlat_entry, F_STRUCT( __field( u64, duration ) __field( u64, outer_duration ) + __field( u64, nmi_total_ts ) __field_struct( struct timespec, timestamp ) __field_desc( long, timestamp, tv_sec ) __field_desc( long, timestamp, tv_nsec ) + __field( unsigned int, nmi_count ) __field( unsigned int, seqnum ) ), - F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\n", + F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, __entry->duration, - __entry->outer_duration), + __entry->outer_duration, + __entry->nmi_total_ts, + __entry->nmi_count), FILTER_OTHER ); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 65aab3914a56..b97286c48735 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -64,6 +64,15 @@ static struct dentry *hwlat_sample_window; /* sample window us */ /* Save the previous tracing_thresh value */ static unsigned long save_tracing_thresh; +/* NMI timestamp counters */ +static u64 nmi_ts_start; +static u64 nmi_total_ts; +static int nmi_count; +static int nmi_cpu; + +/* Tells NMIs to call back to the hwlat tracer to record timestamps */ +bool trace_hwlat_callback_enabled; + /* If the user changed threshold, remember it */ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; @@ -72,7 +81,9 @@ struct hwlat_sample { u64 seqnum; /* unique sequence */ u64 duration; /* delta */ u64 outer_duration; /* delta (outer loop) */ + u64 nmi_total_ts; /* Total time spent in NMIs */ struct timespec timestamp; /* wall time */ + int nmi_count; /* # NMIs during this sample */ }; /* keep the global state somewhere. 
*/ @@ -112,6 +123,8 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->duration = sample->duration; entry->outer_duration = sample->outer_duration; entry->timestamp = sample->timestamp; + entry->nmi_total_ts = sample->nmi_total_ts; + entry->nmi_count = sample->nmi_count; if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -125,6 +138,26 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) #define init_time(a, b) (a = b) #define time_u64(a) a +void trace_hwlat_callback(bool enter) +{ + if (smp_processor_id() != nmi_cpu) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) + nmi_ts_start = time_get(); + else + nmi_total_ts = time_get() - nmi_ts_start; + } + + if (enter) + nmi_count++; +} + /** * get_sample - sample the CPU TSC and look for likely hardware latencies * @@ -144,6 +177,14 @@ static int get_sample(void) do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ + nmi_cpu = smp_processor_id(); + nmi_total_ts = 0; + nmi_count = 0; + /* Make sure NMIs see this first */ + barrier(); + + trace_hwlat_callback_enabled = true; + init_time(last_t2, 0); start = time_get(); /* start timestamp */ @@ -188,6 +229,10 @@ static int get_sample(void) } while (total <= hwlat_data.sample_width); + barrier(); /* finish the above in the view for NMIs */ + trace_hwlat_callback_enabled = false; + barrier(); /* Make sure nmi_total_ts is no longer updated */ + ret = 0; /* If we exceed the threshold value, we have found a hardware latency */ @@ -196,11 +241,17 @@ static int get_sample(void) ret = 1; + /* We read in microseconds */ + if (nmi_total_ts) + do_div(nmi_total_ts, NSEC_PER_USEC); + hwlat_data.count++; s.seqnum = hwlat_data.count; s.duration = sample; s.outer_duration = outer_sample; s.timestamp = CURRENT_TIME; + s.nmi_total_ts = nmi_total_ts; + s.nmi_count = nmi_count; trace_hwlat_sample(&s); /* Keep a running maximum ever recorded hardware latency */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d67a562df259..3fc20422c166 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1109,13 +1109,27 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld\n", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", field->seqnum, field->duration, field->outer_duration, field->timestamp.tv_sec, field->timestamp.tv_nsec); + if (field->nmi_count) { + /* + * The generic sched_clock() is not NMI safe, thus + * we only record the count and not the time. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) + trace_seq_printf(s, " nmi-total:%llu", + field->nmi_total_ts); + trace_seq_printf(s, " nmi-count:%u", + field->nmi_count); + } + + trace_seq_putc(s, '\n'); + return trace_handle_return(s); } -- cgit v1.2.3 From ea2e7ce5d0fc878463ba39deb46cf2ab20398fd2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:21 -0700 Subject: bpf: support 8-byte metafield access The verifier supported only 4-byte metafields in struct __sk_buff and struct xdp_md. The metafields in upcoming struct bpf_perf_event are 8-byte to match register width in struct pt_regs. Teach verifier to recognize 8-byte metafield access. The patch doesn't affect safety of sockets and xdp programs. 
They check for 4-byte only ctx access before these conditions are hit. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index abb61f3f6900..c1c9e441f0f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2333,7 +2333,8 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W) { + if (BPF_SIZE(insn->code) != BPF_W && + BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; continue; } @@ -2642,9 +2643,11 @@ static int convert_ctx_accesses(struct verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { u32 insn_delta, cnt; - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else continue; -- cgit v1.2.3 From 0515e5999a466dfe6e1924f460da599bb6821487 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:22 -0700 Subject: bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type Introduce BPF_PROG_TYPE_PERF_EVENT programs that can be attached to HW and SW perf events (PERF_TYPE_HARDWARE and PERF_TYPE_SOFTWARE correspondingly in uapi/linux/perf_event.h) The program visible context meta structure is struct bpf_perf_event_data { struct pt_regs regs; __u64 sample_period; }; which is accessible directly from the program: int bpf_prog(struct bpf_perf_event_data *ctx) { ... ctx->sample_period ... ... ctx->regs.ip ... } The bpf verifier rewrites the accesses into kernel internal struct bpf_perf_event_data_kern which allows changing struct perf_sample_data without affecting bpf programs. New fields can be added to the end of struct bpf_perf_event_data in the future. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/trace/bpf_trace.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ad35213b8405..d3869b03d9fe 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -8,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = { .type = BPF_PROG_TYPE_TRACEPOINT, }; +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + if (off == offsetof(struct bpf_perf_event_data, sample_period)) { + if (size != sizeof(u64)) + return false; + } else { + if (size != sizeof(long)) + return false; + } + return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_perf_event_data, sample_period): + BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + offsetof(struct perf_sample_data, period)); + break; + default: + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, regs)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), + dst_reg, dst_reg, ctx_off); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_verifier_ops perf_event_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = pe_prog_is_valid_access, + .convert_ctx_access = pe_prog_convert_ctx_access, +}; + +static struct bpf_prog_type_list perf_event_tl = { + .ops = &perf_event_prog_ops, + .type = BPF_PROG_TYPE_PERF_EVENT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); bpf_register_prog_type(&tracepoint_tl); + bpf_register_prog_type(&perf_event_tl); return 0; } late_initcall(register_kprobe_prog_ops); -- cgit v1.2.3 From fdc15d388d600d5a1599e14c700af105a5b60761 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:23 -0700 Subject: bpf: perf_event progs should only use preallocated maps Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use preallocated hash maps, since doing memory allocation in overflow_handler can crash depending on where nmi got triggered. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c1c9e441f0f5..48c2705db22c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2511,6 +2511,20 @@ process_bpf_exit: return 0; } +static int check_map_prog_compatibility(struct bpf_map *map, + struct bpf_prog *prog) + +{ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT && + (map->map_type == BPF_MAP_TYPE_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && + (map->map_flags & BPF_F_NO_PREALLOC)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -2518,7 +2532,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; - int i, j; + int i, j, err; for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && @@ -2562,6 +2576,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return PTR_ERR(map); } + err = check_map_prog_compatibility(map, env->prog); + if (err) { + fdput(f); + return err; + } + /* store map pointer inside BPF_LD_IMM64 instruction */ insn[0].imm = (u32) (unsigned long) map; insn[1].imm = ((u64) (unsigned long) map) >> 32; -- cgit v1.2.3 From aa6a5f3cb2b2edc5b9aab0b4fdfdfa9c3b5096a8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:24 -0700 Subject: perf, bpf: add perf events core support for BPF_PROG_TYPE_PERF_EVENT programs Allow attaching BPF_PROG_TYPE_PERF_EVENT programs to sw and hw perf events via the overflow_handler mechanism. When a program is attached, the overflow_handlers become stacked. The program acts as a filter. Returning zero from the program means that the normal perf_event_output handler will not be called and the sampling event won't be stored in the ring buffer. The overflow_handler_context==NULL is an additional safety check to make sure programs are not attached to hw breakpoints and watchdog in case other checks (that prevent that now anyway) get accidentally relaxed in the future. The program refcnt is incremented in case perf_events are inherited when the target task is forked. Similar to kprobe and tracepoint programs there is no ioctl to detach the program or swap an already attached program. The user space is expected to close(perf_event_fd) like it does right now for kprobe+bpf. That restriction simplifies the code quite a bit. The invocation of overflow_handler in __perf_event_overflow() is now done via READ_ONCE, since that pointer can be replaced when the program is attached while perf_event itself could have been active already. There is no need to do similar treatment for event->prog, since it's assigned only once before it's accessed. Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/events/core.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..85bf4c37911f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7022,7 +7022,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - event->overflow_handler(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7637,11 +7637,83 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .regs = regs, + }; + int ret = 0; + + preempt_disable(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + event->prog = prog; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + return -EOPNOTSUPP; +} +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint; struct bpf_prog *prog; + if (event->attr.type == PERF_TYPE_HARDWARE || + event->attr.type == PERF_TYPE_SOFTWARE) + return perf_event_set_bpf_handler(event, prog_fd); + if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; @@ -7682,6 +7754,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; + perf_event_free_bpf_handler(event); + if (!event->tp_event) return; @@ -8998,6 +9072,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + if (overflow_handler == bpf_overflow_handler) { + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err_ns; + } + event->prog = prog; + event->orig_overflow_handler = + parent_event->orig_overflow_handler; + } +#endif } if (overflow_handler) { -- cgit v1.2.3 From a724632ca0c84b494875e9367e07e29472c139ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Aug 2016 19:49:38 +0200 Subject: cpu/hotplug: Rework callback invocation logic This 
is preparation for the following patch. This rework here changes the arguments of cpuhp_invoke_callback(). It passes now `state' and whether `startup' or `teardown' callback should be invoked. The callback then is looked up by the function. The following is a clanup of callers: - cpuhp_issue_call() has one argument less - struct cpuhp_cpu_state (which is used by the hotplug thread) gets also its callback removed. The decision if it is a single callback invocation moved to the `single' variable. Also a `bringup' variable has been added to distinguish between startup and teardown callback. - take_cpu_down() needs to start one step earlier. We always get here via CPUHP_TEARDOWN_CPU callback. Before that change cpuhp_ap_states + CPUHP_TEARDOWN_CPU pointed to an empty entry because TEARDOWN is saved in bp_states for this reason. Now that we use cpuhp_get_step() to lookup the state we must explicitly skip it in order not to invoke it twice. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/1471024183-12666-2-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 162 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 80 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index ec12b726fa6f..d36d8e0abfb8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -37,8 +37,9 @@ * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback - * @cb_stat: The state for a single callback (install/uninstall) - * @cb: Single callback function (install/uninstall) + * @single: Single callback invocation + * @bringup: Single callback bringup or teardown selector + * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @done: Signal completion to the issuer of the task */ @@ -49,8 +50,9 @@ struct cpuhp_cpu_state { struct task_struct *thread; bool should_run; bool rollback; + bool single; + bool bringup; enum cpuhp_state cb_state; - int (*cb)(unsigned int cpu); int result; struct completion done; #endif @@ -79,24 +81,43 @@ static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_bp_states[]; static struct cpuhp_step cpuhp_ap_states[]; +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @step: The step in the state machine - * @cb: The callback function to invoke + * @bringup: True if the bringup callback should be invoked * * Called from cpu hotplug and from the state register machinery */ -static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, - int (*cb)(unsigned int)) +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, + bool bringup) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + struct cpuhp_step *step = cpuhp_get_step(state); + int (*cb)(unsigned int cpu) = bringup ? 
step->startup : step->teardown; int ret = 0; if (cb) { - trace_cpuhp_enter(cpu, st->target, step, cb); + trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); - trace_cpuhp_exit(cpu, st->state, step, ret); + trace_cpuhp_exit(cpu, st->state, state, ret); } return ret; } @@ -371,62 +392,55 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true); } } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; for (; st->state > target; st->state--) { - struct cpuhp_step *step = steps + st->state; - - ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + ret = cpuhp_invoke_callback(cpu, st->state, false); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st, steps); + undo_cpu_down(cpu, st); break; } } return ret; } -static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->teardown); + cpuhp_invoke_callback(cpu, st->state, false); } } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = steps + st->state; - ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + ret = cpuhp_invoke_callback(cpu, st->state, true); if (ret) { st->target = prev_state; - undo_cpu_up(cpu, st, steps); + undo_cpu_up(cpu, st); break; } } @@ -455,13 +469,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) { enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); + return cpuhp_down_callbacks(cpu, st, target); } /* Execute the online startup callbacks. Used to be CPU_ONLINE */ static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) { - return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); + return cpuhp_up_callbacks(cpu, st, st->target); } /* @@ -484,18 +498,20 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; /* Single callback invocation for [un]install ? 
*/ - if (st->cb) { + if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup); local_irq_enable(); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - undo_cpu_down(cpu, st, cpuhp_ap_states); + undo_cpu_down(cpu, st); /* * This is a momentary workaround to keep the notifier users * happy. Will go away once we got rid of the notifiers. @@ -517,8 +533,8 @@ static void cpuhp_thread_fun(unsigned int cpu) } /* Invoke a single callback on a remote cpu */ -static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int)) +static int +cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -530,10 +546,12 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, cb); + return cpuhp_invoke_callback(cpu, state, bringup); st->cb_state = state; - st->cb = cb; + st->single = true; + st->bringup = bringup; + /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -549,7 +567,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) { st->result = 0; - st->cb = NULL; + st->single = false; /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -700,12 +718,16 @@ static int take_cpu_down(void *_param) if (err < 0) return err; + /* + * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not + * do this step again. + */ + WARN_ON(st->state != CPUHP_TEARDOWN_CPU); + st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) { - struct cpuhp_step *step = cpuhp_ap_states + st->state; + for (; st->state > target; st->state--) + cpuhp_invoke_callback(cpu, st->state, false); - cpuhp_invoke_callback(cpu, st->state, step->teardown); - } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ @@ -844,7 +866,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ - ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { st->target = prev_state; st->rollback = true; @@ -898,11 +920,8 @@ void notify_cpu_starting(unsigned int cpu) enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = cpuhp_ap_states + st->state; - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true); } } @@ -987,7 +1006,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) * responsible for bringing it up to the target state. 
*/ target = min((int)target, CPUHP_BRINGUP_CPU); - ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_up_callbacks(cpu, st, target); out: cpu_hotplug_done(); return ret; @@ -1364,23 +1383,6 @@ static int cpuhp_cb_check(enum cpuhp_state state) return 0; } -static bool cpuhp_is_ap_state(enum cpuhp_state state) -{ - /* - * The extra check for CPUHP_TEARDOWN_CPU is only for documentation - * purposes as that state is handled explicitely in cpu_down. - */ - return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; -} - -static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) -{ - struct cpuhp_step *sp; - - sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; - return sp + state; -} - static void cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), @@ -1406,12 +1408,12 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state) * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ -static int cpuhp_issue_call(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int), bool bringup) +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) { + struct cpuhp_step *sp = cpuhp_get_step(state); int ret; - if (!cb) + if ((bringup && !sp->startup) || (!bringup && !sp->teardown)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown @@ -1419,11 +1421,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) - ret = cpuhp_invoke_ap_callback(cpu, state, cb); + ret = cpuhp_invoke_ap_callback(cpu, state, bringup); else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup); #else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup); #endif BUG_ON(ret && !bringup); return ret; @@ -1434,14 +1436,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, * * Note: The teardown callbacks for rollback are not allowed to fail! */ -static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, - int (*teardown)(unsigned int cpu)) +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state) { int cpu; - if (!teardown) - return; - /* Roll back the already executed steps on the other cpus */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -1452,7 +1450,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, /* Did we invoke the startup call on that cpu ? 
*/ if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false); } } @@ -1527,9 +1525,10 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpustate < state) continue; - ret = cpuhp_issue_call(cpu, state, startup, true); + ret = cpuhp_issue_call(cpu, state, true); if (ret) { - cpuhp_rollback_install(cpu, state, teardown); + if (teardown) + cpuhp_rollback_install(cpu, state); cpuhp_store_callbacks(state, NULL, NULL, NULL); goto out; } @@ -1553,14 +1552,13 @@ EXPORT_SYMBOL(__cpuhp_setup_state); */ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { - int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); int cpu; BUG_ON(cpuhp_cb_check(state)); get_online_cpus(); - if (!invoke || !teardown) + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* @@ -1573,7 +1571,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) int cpustate = st->state; if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false); } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); -- cgit v1.2.3 From cf392d10b69e6e6c57ceea48b347a2ab1a4b75b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Aug 2016 19:49:39 +0200 Subject: cpu/hotplug: Add multi instance support This patch adds the ability for a given state to have multiple instances. Until now all states have a single instance and the startup / teardown callbacks use global variables. A few drivers need to perform the same callbacks on multiple "instances". Currently we have three drivers in tree which all have a global list which they iterate over. With multi instance support they don't need their private list and the functionality has been moved into core code. Plus we hold the hotplug lock in core so no cpus come/go while instances are registered, and we do rollback in the error case :) Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/1471024183-12666-3-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 183 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d36d8e0abfb8..c506485eaa75 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; + struct hlist_node *node; enum cpuhp_state cb_state; int result; struct completion done; @@ -70,11 +71,21 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { - const char *name; - int (*startup)(unsigned int cpu); - int (*teardown)(unsigned int cpu); - bool skip_onerr; - bool cant_stop; + const char *name; + union { + int (*startup)(unsigned int cpu); + int (*startup_multi)(unsigned int cpu, + struct hlist_node *node); + }; + union { + int (*teardown)(unsigned int cpu); + int (*teardown_multi)(unsigned int cpu, + struct hlist_node *node); + }; + struct hlist_head list; + bool skip_onerr; + bool cant_stop; + bool multi_instance; }; static DEFINE_MUTEX(cpuhp_state_mutex); @@ -104,20 +115,59 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) * @step: The step in the state machine * @bringup: True if the bringup callback should be invoked * - * Called from cpu hotplug and from the state register machinery +
* Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup) + bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); - int (*cb)(unsigned int cpu) = bringup ? step->startup : step->teardown; - int ret = 0; - - if (cb) { + int (*cbm)(unsigned int cpu, struct hlist_node *node); + int (*cb)(unsigned int cpu); + int ret, cnt; + + if (!step->multi_instance) { + cb = bringup ? step->startup : step->teardown; + if (!cb) + return 0; trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + cbm = bringup ? step->startup_multi : step->teardown_multi; + if (!cbm) + return 0; + + /* Single invocation for instance add/remove */ + if (node) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + + /* State transition. Invoke on all instances */ + cnt = 0; + hlist_for_each(node, &step->list) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + if (ret) + goto err; + cnt++; + } + return 0; +err: + /* Rollback the instances if one failed */ + cbm = !bringup ? step->startup_multi : step->teardown_multi; + if (!cbm) + return ret; + + hlist_for_each(node, &step->list) { + if (!cnt--) + break; + cbm(cpu, node); } return ret; } @@ -398,7 +448,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } @@ -409,7 +459,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, int ret = 0; for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); if (ret) { st->target = prev_state; undo_cpu_down(cpu, st); @@ -425,7 +475,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false); + cpuhp_invoke_callback(cpu, st->state, false, NULL); } } @@ -437,7 +487,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -502,11 +552,11 @@ static void cpuhp_thread_fun(unsigned int cpu) if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup); + st->bringup, st->node); local_irq_enable(); } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup); + st->bringup, st->node); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -534,7 +584,8 @@ static void cpuhp_thread_fun(unsigned int cpu) /* Invoke a single callback on a remote cpu */ static int -cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup) +cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -546,11 +597,12 @@ 
cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup) * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup); + return cpuhp_invoke_callback(cpu, state, bringup, node); st->cb_state = state; st->single = true; st->bringup = bringup; + st->node = node; /* * Make sure the above stores are visible before should_run becomes @@ -726,7 +778,7 @@ static int take_cpu_down(void *_param) st->state--; /* Invoke the former CPU_DYING callbacks */ for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false); + cpuhp_invoke_callback(cpu, st->state, false, NULL); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -921,7 +973,7 @@ void notify_cpu_starting(unsigned int cpu) while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } @@ -1386,7 +1438,8 @@ static int cpuhp_cb_check(enum cpuhp_state state) static void cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; @@ -1396,6 +1449,8 @@ static void cpuhp_store_callbacks(enum cpuhp_state state, sp->startup = startup; sp->teardown = teardown; sp->name = name; + sp->multi_instance = multi_instance; + INIT_HLIST_HEAD(&sp->list); mutex_unlock(&cpuhp_state_mutex); } @@ -1408,7 +1463,8 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state) * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ -static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { struct cpuhp_step *sp = cpuhp_get_step(state); int ret; @@ -1421,11 +1477,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) - ret = cpuhp_invoke_ap_callback(cpu, state, bringup); + ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #else - ret = cpuhp_invoke_callback(cpu, state, bringup); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #endif BUG_ON(ret && !bringup); return ret; @@ -1436,7 +1492,8 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup) * * Note: The teardown callbacks for rollback are not allowed to fail! */ -static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state) +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + struct hlist_node *node) { int cpu; @@ -1450,7 +1507,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state) /* Did we invoke the startup call on that cpu ? 
*/ if (cpustate >= state) - cpuhp_issue_call(cpu, state, false); + cpuhp_issue_call(cpu, state, false, node); } } @@ -1477,6 +1534,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state) return -ENOSPC; } +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + struct cpuhp_step *sp; + int cpu; + int ret; + + sp = cpuhp_get_step(state); + if (sp->multi_instance == false) + return -EINVAL; + + get_online_cpus(); + + if (!invoke || !sp->startup_multi) + goto add_node; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, true, node); + if (ret) { + if (sp->teardown_multi) + cpuhp_rollback_install(cpu, state, node); + goto err; + } + } +add_node: + ret = 0; + mutex_lock(&cpuhp_state_mutex); + hlist_add_head(node, &sp->list); + mutex_unlock(&cpuhp_state_mutex); + +err: + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); + /** * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state * @state: The state to setup @@ -1490,7 +1593,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state) int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; int dyn_state = 0; @@ -1509,7 +1613,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, state = ret; } - cpuhp_store_callbacks(state, name, startup, teardown); + cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); if (!invoke || !startup) goto out; @@ -1525,11 +1629,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpustate < state) continue; - ret = cpuhp_issue_call(cpu, state, true); + ret = cpuhp_issue_call(cpu, state, true, NULL); if (ret) { if (teardown) - cpuhp_rollback_install(cpu, state); - cpuhp_store_callbacks(state, NULL, NULL, NULL); + cpuhp_rollback_install(cpu, state, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); goto out; } } @@ -1541,6 +1645,42 @@ out: } EXPORT_SYMBOL(__cpuhp_setup_state); +int __cpuhp_state_remove_instance(enum cpuhp_state state, + struct hlist_node *node, bool invoke) +{ + struct cpuhp_step *sp = cpuhp_get_step(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + if (!sp->multi_instance) + return -EINVAL; + + get_online_cpus(); + if (!invoke || !cpuhp_get_teardown_cb(state)) + goto remove; + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! 
+ */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, false, node); + } + +remove: + mutex_lock(&cpuhp_state_mutex); + hlist_del(node); + mutex_unlock(&cpuhp_state_mutex); + put_online_cpus(); + + return 0; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state * @state: The state to remove @@ -1552,12 +1692,20 @@ EXPORT_SYMBOL(__cpuhp_setup_state); */ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { + struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); get_online_cpus(); + if (sp->multi_instance) { + WARN(!hlist_empty(&sp->list), + "Error: Removing state %d which has instances left.\n", + state); + goto remove; + } + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; @@ -1571,10 +1719,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) int cpustate = st->state; if (cpustate >= state) - cpuhp_issue_call(cpu, state, false); + cpuhp_issue_call(cpu, state, false, NULL); } remove: - cpuhp_store_callbacks(state, NULL, NULL, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); -- cgit v1.2.3 From f88eecfe2f22b2790e7527c0aaec14ea175919de Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Tue, 16 Aug 2016 16:05:08 +0200 Subject: genirq/generic_chip: Verify irqs_per_chip <= 32 Most (if not all) code here implicitly assumes that the maximum number of IRQs per chip will be 32, and thus uses 'u32' or 'unsigned long' for many tasks (for example "struct irq_data" declares its 'mask' field as 'u32', and "struct irq_chip_generic" declares its 'installed' field as 'unsigned long') However, there is no check to verify that irqs_per_chip is <= 32. Hence, calling irq_alloc_domain_generic_chips() with a bigger value will result in unexpected results. Provide a wrapper with a MAYBE_BUILD_BUG_ON(nrirqs >= 32) to catch such cases. 
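The hunk below only renames the core allocator; the wrapper carrying the check lives in the header and is not part of this diff. It is presumably shaped along these lines (a sketch, not the exact header change):

/*
 * Sketch of the header-side wrapper: a macro, so the check can resolve
 * at compile time when irqs_per_chip is a constant.
 */
#define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,	\
				       handler, clr, set, gcflags)	\
({									\
	MAYBE_BUILD_BUG_ON(irqs_per_chip > 32);				\
	__irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct,	\
					 name, handler, clr, set,	\
					 gcflags);			\
})

MAYBE_BUILD_BUG_ON() degrades to a runtime BUG_ON() when its argument is not a compile-time constant, so callers passing a computed irqs_per_chip are still caught.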
[ tglx: Reduced changelog to the essential information ] Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/57B31D94.5040701@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index a3a392097804..ee32870079c9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } /** - * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain * @d: irq domain for which to allocate chips - * @irqs_per_chip: Number of interrupts each chip handles + * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with this * @name: Name of the irq chip * @handler: Default flow handler associated with these chips @@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) * @set: IRQ_* bits to set in the mapping function * @gcflags: Generic chip specific setup flags */ -int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, - int num_ct, const char *name, - irq_flow_handler_t handler, - unsigned int clr, unsigned int set, - enum irq_gc_flags gcflags) +int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; @@ -326,7 +326,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, d->name = name; return 0; } -EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); +EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); static struct irq_chip_generic * __irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) -- cgit v1.2.3 From 01b41159066531cc8d664362ff0cd89dd137bbfa Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Thu, 9 Jun 2016 23:43:28 -0700 Subject: cpu/hotplug: Handle unbalanced hotplug enable/disable When cpu_hotplug_enable() is called unbalanced w/o a preceding cpu_hotplug_disable(), the code emits a warning, but happily decrements the disabled counter. This causes the next operations to malfunction. Prevent the decrement and just emit a warning.
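To illustrate the contract being enforced (a minimal sketch; the call sites are hypothetical):

/* Balanced use: each enable pairs with a preceding disable. */
cpu_hotplug_disable();		/* cpu_hotplug_disabled++ */
/* ... work that must not race with CPU hotplug ... */
cpu_hotplug_enable();		/* cpu_hotplug_disabled-- */

cpu_hotplug_enable();		/* unbalanced: now only warns */

Before this change the stray enable drove cpu_hotplug_disabled negative, so a subsequent cpu_hotplug_disable() merely brought the counter back to zero and hotplug stayed enabled even though the caller had asked for it to be disabled.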
Signed-off-by: Lianwei Wang Cc: peterz@infradead.org Cc: linux-pm@vger.kernel.org Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1465541008-12476-1-git-send-email-lianwei.wang@gmail.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c506485eaa75..c90f839c5b86 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -331,10 +331,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -1160,7 +1167,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; -- cgit v1.2.3 From 58763148758057ffc447bf990321d3ea86d199a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Aug 2016 10:15:03 +0200 Subject: perf/core: Remove WARN from perf_event_read() This effectively reverts commit: 71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI") ... and puts in a comment explaining why we ignore the return value. Reported-by: Vegard Nossum Signed-off-by: Peter Zijlstra (Intel) Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..07ac8596a728 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3549,10 +3549,18 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); - /* The event must have been read from an online CPU: */ - WARN_ON_ONCE(ret); - ret = ret ? : data.ret; + /* + * Purposely ignore the smp_call_function_single() return + * value. + * + * If event->oncpu isn't a valid CPU it means the event got + * scheduled out and that will have updated the event count. + * + * Therefore, either way, we'll have an up-to-date event count + * after this. + */ + (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; -- cgit v1.2.3 From 135e8c9250dd5c8c9aae5984fde6f230d0cbfeaf Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 5 Sep 2016 13:16:40 +1000 Subject: sched/core: Fix a race between try_to_wake_up() and a woken up task The origin of the issue I've seen is related to a missing memory barrier between check for task->state and the check for task->on_rq. 
The task being woken up is already awake from a schedule() and is doing the following: do { schedule() set_current_state(TASK_(UN)INTERRUPTIBLE); } while (!cond); The waker actually gets stuck doing the following in try_to_wake_up(): while (p->on_cpu) cpu_relax(); Analysis: The instance I've seen involves the following race: CPU1 CPU2 while () { if (cond) break; do { schedule(); set_current_state(TASK_UN..) } while (!cond); wakeup_routine() spin_lock_irqsave(wait_lock) raw_spin_lock_irqsave(wait_lock) wake_up_process() } try_to_wake_up() set_current_state(TASK_RUNNING); .. list_del(&waiter.list); CPU2 wakes up CPU1, but before it can get the wait_lock and set its current state to TASK_RUNNING, the following occurs: CPU3 wakeup_routine() raw_spin_lock_irqsave(wait_lock) if (!list_empty) wake_up_process() try_to_wake_up() raw_spin_lock_irqsave(p->pi_lock) .. if (p->on_rq && ttwu_wakeup()) .. while (p->on_cpu) cpu_relax() .. CPU3 tries to wake up the task on CPU1 again since it finds it on the wait_queue, CPU1 is spinning on wait_lock, but immediately after CPU2, CPU3 got it. CPU3 checks the state of p on CPU1, it is TASK_UNINTERRUPTIBLE and the task is spinning on the wait_lock. Interestingly since p->on_rq is checked under pi_lock, I've noticed that try_to_wake_up() finds p->on_rq to be 0. This was the most confusing bit of the analysis, but p->on_rq is changed under the runqueue lock, rq_lock, so the p->on_rq check is not reliable without this fix IMHO. The race is visible (based on the analysis) only when ttwu_queue() does a remote wakeup via ttwu_queue_remote, in which case the p->on_rq change is not done under the pi_lock. The result is that after a while the entire system locks up on the raw_spin_lock_irqsave(wait_lock) and the holder spins infinitely. Reproduction of the issue: The issue can be reproduced after a long run on my system with 80 threads and having to tweak available memory to very low and running the stress-ng mmapfork memory test. It usually takes a long time to reproduce. I am trying to work on a test case that can reproduce the issue faster, but that's work in progress. I am still testing the changes on my system in a loop and the tests seem OK thus far. Big thanks to Benjamin and Nick for helping debug this as well. Ben helped catch the missing barrier, Nick caught every missing bit in my theory. Signed-off-by: Balbir Singh [ Updated comment to clarify matching barriers. Many architectures do not have a full barrier in switch_to() so that cannot be relied upon. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Benjamin Herrenschmidt Cc: Alexey Kardashevskiy Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/e02cce7b-d9ca-1ad0-7a61-ea97c7582b37@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..44817c640e99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2016,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----.
+ * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; -- cgit v1.2.3 From c9bbdd4830ab06288bb1d8c00ed8c8c6e80e377a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 15 Aug 2016 11:42:45 +0100 Subject: perf/core: Don't pass PERF_EF_START to the PMU ->start callback PERF_EF_START is a flag to indicate to the PMU ->add() callback that, as well as claiming the PMU resources required by the event being added, it should also start the PMU. Passing this flag to the ->start() callback doesn't make sense, because ->start() always tries to start the PMU. Remove it. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1471257765-29662-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dff00c787867..74f22a95eb63 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2492,7 +2492,7 @@ static int __perf_event_stop(void *info) * while restarting. */ if (sd->restart) - event->pmu->start(event, PERF_EF_START); + event->pmu->start(event, 0); return 0; } -- cgit v1.2.3 From 97a7142f157a6361a659ff3eec2c3cf636bd7490 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Sun, 5 Jul 2015 18:33:48 +0900 Subject: sched/fair: Make update_min_vruntime() more readable The update_min_vruntime() control flow can be simplified. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: minchan.kim@lge.com Link: http://lkml.kernel.org/r/1436088829-25768-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61d485421bed..9a18aae0b0ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -464,20 +464,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; - if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - if (!cfs_rq->curr) - vruntime = se->vruntime; - else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = se->vruntime; } + if (cfs_rq->curr) + vruntime = min_vruntime(vruntime, cfs_rq->curr->vruntime); + /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT @@ -5988,7 +5985,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * The adjacency matrix of the resulting graph is given by: * - * log_2 n + * log_2 n * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * k = 0 * @@ -6034,7 +6031,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * [XXX write more on how we solve this.. 
_after_ merging pjt's patches that * rewrite all of this once again.] - */ + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -6696,7 +6693,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) /* * !SD_OVERLAP domains can assume that child groups * span the current group. - */ + */ group = child->groups; do { -- cgit v1.2.3 From 126b3b6842cc848fc9880e7816e0a8d743be51f1 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Sun, 14 Aug 2016 16:27:06 +0200 Subject: sched/deadline: Refactor CPU heap code 1. heapify up factored out in new dedicated function heapify_up() (avoids repetition of same code) 2. call to cpudl_change_key() replaced with heapify_up() when cpudl_set actually inserts a new node in the heap 3. cpudl_change_key() replaced with heapify() that heapifies up or down as needed. Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Reviewed-by: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-dl@retis.sssup.it Link: http://lkml.kernel.org/r/1471184828-12644-2-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 50 +++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..0acb0d4e2fb7 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -41,7 +41,7 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); } -static void cpudl_heapify(struct cpudl *cp, int idx) +static void cpudl_heapify_down(struct cpudl *cp, int idx) { int l, r, largest; @@ -66,23 +66,24 @@ static void cpudl_heapify(struct cpudl *cp, int idx) } } -static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +static void cpudl_heapify_up(struct cpudl *cp, int idx) { - WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); - - if (dl_time_before(new_dl, cp->elements[idx].dl)) { - cp->elements[idx].dl = new_dl; - cpudl_heapify(cp, idx); - } else { - cp->elements[idx].dl = new_dl; - while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) { - cpudl_exchange(cp, idx, parent(idx)); - idx = parent(idx); - } + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); } } +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) + cpudl_heapify_up(cp, idx); + else + cpudl_heapify_down(cp, idx); +} + static inline int cpudl_maximum(struct cpudl *cp) { return cp->elements[0].cpu; @@ -154,27 +155,22 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->size--; cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; - while (old_idx > 0 && dl_time_before( - cp->elements[parent(old_idx)].dl, - cp->elements[old_idx].dl)) { - cpudl_exchange(cp, old_idx, parent(old_idx)); - old_idx = parent(old_idx); - } + cpudl_heapify(cp, old_idx); cpumask_set_cpu(cpu, cp->free_cpus); - cpudl_heapify(cp, old_idx); goto out; } if (old_idx == IDX_INVALID) { - cp->size++; - cp->elements[cp->size - 1].dl = dl; - cp->elements[cp->size - 1].cpu = cpu; - cp->elements[cpu].idx = cp->size - 1; - cpudl_change_key(cp, cp->size - 1, dl); + int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; + 
cp->elements[new_idx].cpu = cpu; + cp->elements[cpu].idx = new_idx; + cpudl_heapify_up(cp, new_idx); cpumask_clear_cpu(cpu, cp->free_cpus); } else { - cpudl_change_key(cp, old_idx, dl); + cp->elements[old_idx].dl = dl; + cpudl_heapify(cp, old_idx); } out: -- cgit v1.2.3 From 8e1bc301aaf9f9a2d731bf8d50d549ac2dcfdab2 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Sun, 14 Aug 2016 16:27:07 +0200 Subject: sched/deadline: Make CPU heap faster avoiding real swaps on heapify This change goes from heapify() ops done by swapping with parent/child so that the item to fix moves along, to heapify() ops done by just pulling the parent/child chain by 1 pos, then storing the item to fix just at the end. On a non-trivial heapify(), this performs roughly half stores wrt swaps. This has been measured to achieve up to 10% of speed-up for cpudl_set() calls, with a randomly generated workload of 1K,10K,100K random heap insertions and deletions (75% cpudl_set() calls with is_valid=1 and 25% with is_valid=0), and randomly generated cpu IDs, with up to 256 CPUs, as measured on an Intel Core2 Duo. Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Reviewed-by: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-dl@retis.sssup.it Link: http://lkml.kernel.org/r/1471184828-12644-3-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 66 +++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 0acb0d4e2fb7..0ace75a7a87b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,48 +31,72 @@ static inline int right_child(int i) return (i << 1) + 2; } -static void cpudl_exchange(struct cpudl *cp, int a, int b) -{ - int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; - - swap(cp->elements[a].cpu, cp->elements[b].cpu); - swap(cp->elements[a].dl , cp->elements[b].dl ); - - swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); -} - static void cpudl_heapify_down(struct cpudl *cp, int idx) { int l, r, largest; + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; + + if (left_child(idx) >= cp->size) + return; + /* adapted from lib/prio_heap.c */ while(1) { + u64 largest_dl; l = left_child(idx); r = right_child(idx); largest = idx; + largest_dl = orig_dl; - if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, - cp->elements[l].dl)) + if ((l < cp->size) && dl_time_before(orig_dl, + cp->elements[l].dl)) { largest = l; - if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, - cp->elements[r].dl)) + largest_dl = cp->elements[l].dl; + } + if ((r < cp->size) && dl_time_before(largest_dl, + cp->elements[r].dl)) largest = r; + if (largest == idx) break; - /* Push idx down the heap one level and bump one up */ - cpudl_exchange(cp, largest, idx); + /* pull largest child onto idx */ + cp->elements[idx].cpu = cp->elements[largest].cpu; + cp->elements[idx].dl = cp->elements[largest].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; idx = largest; } + /* actual push down of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; } static void cpudl_heapify_up(struct cpudl *cp, int idx) { - while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) { - cpudl_exchange(cp, 
idx, parent(idx)); - idx = parent(idx); - } + int p; + + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; + + if (idx == 0) + return; + + do { + p = parent(idx); + if (dl_time_before(orig_dl, cp->elements[p].dl)) + break; + /* pull parent onto idx */ + cp->elements[idx].cpu = cp->elements[p].cpu; + cp->elements[idx].dl = cp->elements[p].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; + idx = p; + } while (idx != 0); + /* actual push up of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; } static void cpudl_heapify(struct cpudl *cp, int idx) -- cgit v1.2.3 From d8206bb3ffe0eaee03abfad46fd44d8b17142e88 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Sun, 14 Aug 2016 16:27:08 +0200 Subject: sched/deadline: Split cpudl_set() into cpudl_set() and cpudl_clear() These 2 exercise independent code paths and need different arguments. After this change, you call: cpudl_clear(cp, cpu); cpudl_set(cp, cpu, dl); instead of: cpudl_set(cp, cpu, 0 /* dl */, 0 /* is_valid */); cpudl_set(cp, cpu, dl, 1 /* is_valid */); Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Reviewed-by: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-dl@retis.sssup.it Link: http://lkml.kernel.org/r/1471184828-12644-4-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 49 +++++++++++++++++++++++++++++++--------------- kernel/sched/cpudeadline.h | 3 ++- kernel/sched/deadline.c | 10 +++++----- 3 files changed, 40 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 0ace75a7a87b..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -145,16 +145,15 @@ out: } /* - * cpudl_set - update the cpudl max-heap + * cpudl_clear - remove a cpu from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +void cpudl_clear(struct cpudl *cp, int cpu) { int old_idx, new_cpu; unsigned long flags; @@ -162,17 +161,15 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->elements[cpu].idx; - if (!is_valid) { - /* remove item */ - if (old_idx == IDX_INVALID) { - /* - * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is - * called for a CPU without -dl tasks running. - */ - goto out; - } + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. 
+ */ + } else { new_cpu = cp->elements[cp->size - 1].cpu; cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; @@ -180,11 +177,32 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - cpumask_set_cpu(cpu, cp->free_cpus); - goto out; + cpumask_set_cpu(cpu, cp->free_cpus); } + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl) +{ + int old_idx; + unsigned long flags; + + WARN_ON(!cpu_present(cpu)); + raw_spin_lock_irqsave(&cp->lock, flags); + + old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { int new_idx = cp->size++; cp->elements[new_idx].dl = dl; @@ -197,7 +215,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cpudl_heapify(cp, old_idx); } -out: raw_spin_unlock_irqrestore(&cp->lock, flags); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -23,7 +23,8 @@ struct cpudl { #ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d091f4a95416..18fb0b8fc911 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -798,7 +798,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { dl_rq->earliest_dl.curr = deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } } @@ -813,14 +813,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } } @@ -1671,7 +1671,7 @@ static void rq_online_dl(struct rq *rq) cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) - cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); } /* Assumes rq->lock is held */ @@ -1680,7 +1680,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } -- cgit v1.2.3 From 2665621506e178a1f62e59200403c359c463ea5e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 10 Aug 2016 11:27:27 +0100 Subject: sched/fair: Fix 
load_above_capacity fixed point arithmetic width

Since commit:

  2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels")

we now have two different fixed point units for load:
load_above_capacity has to use the 10-bit fixed point unit of PELT,
whereas NICE_0_LOAD has a 20-bit fixed point unit on 64-bit kernels.
Fix this by scaling down NICE_0_LOAD when multiplying
load_above_capacity by it.

Signed-off-by: Dietmar Eggemann
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Vincent Guittot
Acked-by: Morten Rasmussen
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Yuyang Du
Link: http://lkml.kernel.org/r/1470824847-5316-1-git-send-email-dietmar.eggemann@arm.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a18aae0b0ad..6011bfe81665 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7193,7 +7193,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
 		if (load_above_capacity > busiest->group_capacity) {
 			load_above_capacity -= busiest->group_capacity;
-			load_above_capacity *= NICE_0_LOAD;
+			load_above_capacity *= scale_load_down(NICE_0_LOAD);
 			load_above_capacity /= busiest->group_capacity;
 		} else
 			load_above_capacity = ~0UL;
-- cgit v1.2.3


From efca03ecbe29a46c2c5ae539563b6326af9dcba7 Mon Sep 17 00:00:00 2001
From: "seokhoon.yoon"
Date: Tue, 16 Aug 2016 18:26:08 +0900
Subject: sched/core: Remove duplicated init_task's preempt_notifiers init

init_task's preempt_notifiers is initialized twice:

1) sched_init()
   -> INIT_HLIST_HEAD(&init_task.preempt_notifiers)

2) sched_init()
   -> init_idle(current,) <--- current task is init_task at this time
      -> __sched_fork(,current)
         -> INIT_HLIST_HEAD(&p->preempt_notifiers)

I think the first one is unnecessary, so remove it.

Signed-off-by: seokhoon.yoon
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1471339568-5790-1-git-send-email-iamyooon@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d602f508ca1..90b1961f6ea5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7554,10 +7554,6 @@ void __init sched_init(void)

 	set_load_weight(&init_task);

-#ifdef CONFIG_PREEMPT_NOTIFIERS
-	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-- cgit v1.2.3


From 61c7aca695b6fabe85d0fc424fe8ae2f66f267dd Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Wed, 31 Aug 2016 18:27:44 +0800
Subject: sched/deadline: Fix the intention to re-evaluate tick dependency for offline CPU

A dl task is replenished after its dl task timer fires and starts a new
period. The task is then enqueued so that its dependency on the tick is
re-evaluated and the tick restarted if needed. However, if the CPU has
been hot-unplugged, irq_work_queue() splats because the target CPU is
offline. As a result we get:

WARNING: CPU: 2 PID: 0 at kernel/irq_work.c:69 irq_work_queue_on+0xad/0xe0
Call Trace:
 dump_stack+0x99/0xd0
 __warn+0xd1/0xf0
 warn_slowpath_null+0x1d/0x20
 irq_work_queue_on+0xad/0xe0
 tick_nohz_full_kick_cpu+0x44/0x50
 tick_nohz_dep_set_cpu+0x74/0xb0
 enqueue_task_dl+0x226/0x480
 activate_task+0x5c/0xa0
 dl_task_timer+0x19b/0x2c0
 ?
push_dl_task.part.31+0x190/0x190

This can be triggered by hot-unplugging the full dynticks CPU on which
the dl task is running. We enqueue the dl task on the offline CPU
because we need to do the replenishment for start_dl_timer(). So, as
Juri pointed out, what we would need to do is call replenish_dl_entity()
directly, instead of enqueue_task_dl(). pi_se shouldn't be a problem, as
the task shouldn't be boosted if it was throttled.

This patch fixes it by avoiding the whole enqueue+dequeue+enqueue story,
by first migrating (set_task_cpu()) and then doing 1 enqueue.

Suggested-by: Peter Zijlstra
Signed-off-by: Wanpeng Li
Signed-off-by: Peter Zijlstra (Intel)
Cc: Frederic Weisbecker
Cc: Juri Lelli
Cc: Linus Torvalds
Cc: Luca Abeni
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1472639264-3932-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/deadline.c | 46 ++++++++++++++++++----------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 18fb0b8fc911..0c75bc656178 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
-	bool fallback = false;

 	later_rq = find_lock_later_rq(p, rq);
-
 	if (!later_rq) {
 		int cpu;

@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 		 * If we cannot preempt any rq, fall back to pick any
 		 * online cpu.
 		 */
-		fallback = true;
 		cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
 		if (cpu >= nr_cpu_ids) {
 			/*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 		double_lock_balance(rq, later_rq);
 	}

-	/*
-	 * By now the task is replenished and enqueued; migrate it.
-	 */
-	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, 0);
-
-	if (!fallback)
-		resched_curr(later_rq);
-
 	double_unlock_balance(later_rq, rq);

 	return later_rq;
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		goto unlock;
 	}

-	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-	if (dl_task(rq->curr))
-		check_preempt_curr_dl(rq, p, 0);
-	else
-		resched_curr(rq);
-
 #ifdef CONFIG_SMP
-	/*
-	 * Perform balancing operations here; after the replenishments.  We
-	 * cannot drop rq->lock before this, otherwise the assertion in
-	 * start_dl_timer() about not missing updates is not true.
-	 *
-	 * If we find that the rq the task was on is no longer available, we
-	 * need to select a new rq.
-	 *
-	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
-	 */
 	if (unlikely(!rq->online)) {
+		/*
+		 * If the runqueue is no longer available, migrate the
+		 * task elsewhere. This necessarily changes rq.
+		 */
 		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		rq = dl_task_offline_migration(rq, p);
 		rf.cookie = lockdep_pin_lock(&rq->lock);
+
+		/*
+		 * Now that the task has been migrated to the new RQ and we
+		 * have that locked, proceed as normal and enqueue the task
+		 * there.
+		 */
 	}
+#endif
+
+	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+	if (dl_task(rq->curr))
+		check_preempt_curr_dl(rq, p, 0);
+	else
+		resched_curr(rq);
+
+#ifdef CONFIG_SMP
 	/*
 	 * Queueing this task back might have overloaded rq, check if we need
 	 * to kick someone away.
-- cgit v1.2.3 From 1a3d027c5a6847e5d349c8527f99aada47e5467a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:23 -0500 Subject: sched/debug: Rename and move enqueue_sleeper() enqueue_sleeper() doesn't actually enqueue, it just handles some statistics and tracepoints. Rename it to update_stats_enqueue_sleeper() and call it from update_stats_enqueue(). Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fb20b7159dc4d028c406c0e8d5f8c439b741615b.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 142 +++++++++++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6011bfe81665..479639f6dc80 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -862,11 +862,72 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.wait_start = 0; } +static void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + + if (se->statistics.sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; + + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (se->statistics.block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; + + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + if (tsk->in_iowait) { + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } +} + /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Are we enqueueing a waiting task? 
(for current tasks @@ -874,6 +935,9 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(cfs_rq, se); } static inline void @@ -910,7 +974,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { } @@ -3197,68 +3266,6 @@ static inline int idle_balance(struct rq *rq) #endif /* CONFIG_SMP */ -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - struct task_struct *tsk = NULL; - - if (entity_is_task(se)) - tsk = task_of(se); - - if (se->statistics.sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; - - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (se->statistics.block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; - - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } -#endif -} - static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG @@ -3385,15 +3392,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); - if (flags & ENQUEUE_WAKEUP) { + if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); - if (schedstat_enabled()) - enqueue_sleeper(cfs_rq, se); - } check_schedstat_required(); if (schedstat_enabled()) { - update_stats_enqueue(cfs_rq, se); + update_stats_enqueue(cfs_rq, se, flags); check_spread(cfs_rq, se); } if (!curr) -- cgit v1.2.3 From ae92882e5646d8661a3ca182ba988752fe4b773f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:24 -0500 Subject: sched/debug: Clean up schedstat macros The schedstat_*() macros are inconsistent: most of them take a pointer and a field which the macro combines, whereas schedstat_set() takes the already combined ptr->field. The already combined ptr->field argument is actually more intuitive and easier to use, and there's no reason to require the user to split the variable up, so convert the macros to use the combined argument. 
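For illustration, a minimal userspace sketch of the converted macros
(illustrative only, not part of the patch; the static branch is modeled
with a plain integer flag):

#include <stdio.h>

static int schedstats = 1; /* stand-in for static_branch_unlikely(&sched_schedstats) */

#define schedstat_enabled()	(schedstats)
#define schedstat_inc(var)	do { if (schedstat_enabled()) { (var)++; } } while (0)
#define schedstat_add(var, amt)	do { if (schedstat_enabled()) { (var) += (amt); } } while (0)

struct toy_rq {
	unsigned long yld_count;
	unsigned long sched_count;
};

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	/* the old style would have been schedstat_inc(&rq, yld_count) */
	schedstat_inc(rq.yld_count);
	schedstat_add(rq.sched_count, 2);

	printf("yld_count=%lu sched_count=%lu\n",
	       rq.yld_count, rq.sched_count);
	return 0;
}

The combined argument also lets a caller pass any lvalue, such as
p->se.statistics.nr_wakeups, without the macro having to know how the
pointer and field compose.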
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/54953ca25bb579f3a5946432dee409b0e05222c6.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 +++++++++++----------- kernel/sched/debug.c | 4 ++-- kernel/sched/fair.c | 42 +++++++++++++++++++++--------------------- kernel/sched/idle_task.c | 2 +- kernel/sched/stats.h | 22 +++++++++++----------- 5 files changed, 46 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 90b1961f6ea5..850677049d4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1636,16 +1636,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) int this_cpu = smp_processor_id(); if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); + schedstat_inc(rq->ttwu_local); + schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p, se.statistics.nr_wakeups_remote); + schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); for_each_domain(this_cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); + schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1653,15 +1653,15 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); + schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); + schedstat_inc(rq->ttwu_count); + schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); + schedstat_inc(p->se.statistics.nr_wakeups_sync); #endif /* CONFIG_SCHEDSTATS */ } @@ -3237,7 +3237,7 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_count); + schedstat_inc(this_rq()->sched_count); } /* @@ -4849,7 +4849,7 @@ SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); /* @@ -5000,7 +5000,7 @@ again: yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); /* * Make p's CPU reschedule; pick_next_entity takes care of * fairness. 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..92fa53457b72 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -429,9 +429,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), + SPLIT_NS(schedstat_val(p->se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val(p->se.statistics.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 479639f6dc80..157d741cec34 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -800,7 +800,7 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); + schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); @@ -3275,7 +3275,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) d = -d; if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); + schedstat_inc(cfs_rq->nr_spread_over); #endif } @@ -5164,13 +5164,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, balanced = this_eff_load <= prev_eff_load; - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (!balanced) return 0; - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); return 1; } @@ -6183,7 +6183,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int cpu; - schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -6214,7 +6214,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p, se.statistics.nr_failed_migrations_running); + schedstat_inc(p->se.statistics.nr_failed_migrations_running); return 0; } @@ -6231,13 +6231,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->se.statistics.nr_forced_migrations); } return 1; } - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->se.statistics.nr_failed_migrations_hot); return 0; } @@ -6277,7 +6277,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * so we can safely collect stats here rather than * inside detach_tasks(). */ - schedstat_inc(env->sd, lb_gained[env->idle]); + schedstat_inc(env->sd->lb_gained[env->idle]); return p; } return NULL; @@ -6369,7 +6369,7 @@ next: * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task(). 
*/ - schedstat_add(env->sd, lb_gained[env->idle], detached); + schedstat_add(env->sd->lb_gained[env->idle], detached); return detached; } @@ -7510,7 +7510,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - schedstat_inc(sd, lb_count[idle]); + schedstat_inc(sd->lb_count[idle]); redo: if (!should_we_balance(&env)) { @@ -7520,19 +7520,19 @@ redo: group = find_busiest_group(&env); if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); + schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } busiest = find_busiest_queue(&env, group); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); + schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == env.dst_rq); - schedstat_add(sd, lb_imbalance[idle], env.imbalance); + schedstat_add(sd->lb_imbalance[idle], env.imbalance); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -7639,7 +7639,7 @@ more_balance: } if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + schedstat_inc(sd->lb_failed[idle]); /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very @@ -7722,7 +7722,7 @@ out_all_pinned: * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them. */ - schedstat_inc(sd, lb_balanced[idle]); + schedstat_inc(sd->lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -7915,15 +7915,15 @@ static int active_load_balance_cpu_stop(void *data) .idle = CPU_IDLE, }; - schedstat_inc(sd, alb_count); + schedstat_inc(sd->alb_count); p = detach_one_task(&env); if (p) { - schedstat_inc(sd, alb_pushed); + schedstat_inc(sd->alb_pushed); /* Active balancing done, reset the failure counter. */ sd->nr_balance_failed = 0; } else { - schedstat_inc(sd, alb_failed); + schedstat_inc(sd->alb_failed); } } rcu_read_unlock(); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..dedc81ecbb2e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -28,7 +28,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie c { put_prev_task(rq, prev); - schedstat_inc(rq, sched_goidle); + schedstat_inc(rq->sched_goidle); return rq->idle; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..fc0542576a4a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,11 +29,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) -# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) -# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) -# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) ((schedstat_enabled()) ? 
(var) : 0) #else /* !CONFIG_SCHEDSTATS */ static inline void @@ -45,12 +45,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} -# define schedstat_enabled() 0 -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -# define schedstat_set(var, val) do { } while (0) -# define schedstat_val(rq, field) 0 -#endif +#define schedstat_enabled() 0 +#define schedstat_inc(var) do { } while (0) +#define schedstat_add(var, amt) do { } while (0) +#define schedstat_set(var, val) do { } while (0) +#define schedstat_val(var) 0 +#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) -- cgit v1.2.3 From 20e1d4863bfa7152e98f94e5bcdda3e7db41d899 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:25 -0500 Subject: sched/debug: Rename 'schedstat_val()' -> 'schedstat_val_or_zero()' The schedstat_val() macro's behavior is kind of surprising: when schedstat is runtime disabled, it returns zero. Rename it to schedstat_val_or_zero(). There's also a need for a similar macro which doesn't have the 'if (schedstat_enable())' check, to avoid doing the check twice. Create a new 'schedstat_val()' macro for that. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3bb1d2367d041fee333b0dde17171e709395b675.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++-- kernel/sched/stats.h | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 92fa53457b72..63ffcaa5d57c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -429,9 +429,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val(p->se.statistics.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val(p->se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index fc0542576a4a..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -33,7 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -#define schedstat_val(var) ((schedstat_enabled()) ? (var) : 0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) #else /* !CONFIG_SCHEDSTATS */ static inline void @@ -50,6 +51,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) #define schedstat_add(var, amt) do { } while (0) #define schedstat_set(var, val) do { } while (0) #define schedstat_val(var) 0 +#define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO -- cgit v1.2.3 From 4fa8d299b43a91f871f6d5b00dd5ab33d43bbc2c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jun 2016 12:43:26 -0500 Subject: sched/debug: Remove several CONFIG_SCHEDSTATS guards Clean up the sched code by removing several of the CONFIG_SCHEDSTATS guards, using schedstat_*() macros where needed. Code size: !CONFIG_SCHEDSTATS defconfig: text data bss dec hex filename 10209818 4368184 1105920 15683922 ef5152 vmlinux.before.nostats 10209818 4368184 1105920 15683922 ef5152 vmlinux.after.nostats CONFIG_SCHEDSTATS defconfig: text data bss dec hex filename 10214210 4370040 1105920 15690170 ef69ba vmlinux.before.stats 10214210 4370680 1105920 15690810 ef6c3a vmlinux.after.stats Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e51e0ebe5af95ac295de720dd252e7c0d2142e4a.1466184592.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 33 +++++------- kernel/sched/debug.c | 99 ++++++++++++++++++---------------- kernel/sched/fair.c | 148 ++++++++++++++++++++++++--------------------------- 3 files changed, 136 insertions(+), 144 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 850677049d4a..860070fba814 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1629,13 +1629,15 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); + struct rq *rq; -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); + if (!schedstat_enabled()) + return; + + rq = this_rq(); - if (cpu == this_cpu) { +#ifdef CONFIG_SMP + if (cpu == rq->cpu) { schedstat_inc(rq->ttwu_local); schedstat_inc(p->se.statistics.nr_wakeups_local); } else { @@ -1643,7 +1645,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); - for_each_domain(this_cpu, sd) { + for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd->ttwu_wake_remote); break; @@ -1654,7 +1656,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) schedstat_inc(p->se.statistics.nr_wakeups_migrate); - #endif /* CONFIG_SMP */ schedstat_inc(rq->ttwu_count); @@ -1662,8 +1663,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_SYNC) schedstat_inc(p->se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2084,8 +2083,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); stat: - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2134,8 +2132,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0, cookie); - 
if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -7675,12 +7672,10 @@ void normalize_rt_tasks(void) if (p->flags & PF_KTHREAD) continue; - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.statistics.wait_start = 0; - p->se.statistics.sleep_start = 0; - p->se.statistics.block_start = 0; -#endif + p->se.exec_start = 0; + schedstat_set(p->se.statistics.wait_start, 0); + schedstat_set(p->se.statistics.sleep_start, 0); + schedstat_set(p->se.statistics.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 63ffcaa5d57c..13935886a471 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + PN_SCHEDSTAT(se->statistics.wait_start); + PN_SCHEDSTAT(se->statistics.sleep_start); + PN_SCHEDSTAT(se->statistics.block_start); + PN_SCHEDSTAT(se->statistics.sleep_max); + PN_SCHEDSTAT(se->statistics.block_max); + PN_SCHEDSTAT(se->statistics.exec_max); + PN_SCHEDSTAT(se->statistics.slice_max); + PN_SCHEDSTAT(se->statistics.wait_max); + PN_SCHEDSTAT(se->statistics.wait_sum); + P_SCHEDSTAT(se->statistics.wait_count); } -#endif P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); #endif + +#undef PN_SCHEDSTAT #undef PN +#undef P_SCHEDSTAT #undef P } #endif @@ -626,9 +631,7 @@ do { \ #undef P64 #endif -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -636,9 +639,8 @@ do { \ P(ttwu_count); P(ttwu_local); } - #undef P -#endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); @@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) #define __PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) PN(se.exec_start); PN(se.vruntime); @@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_migrations); -#ifdef CONFIG_SCHEDSTATS if 
(schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); + PN_SCHEDSTAT(se.statistics.wait_start); + PN_SCHEDSTAT(se.statistics.sleep_start); + PN_SCHEDSTAT(se.statistics.block_start); + PN_SCHEDSTAT(se.statistics.sleep_max); + PN_SCHEDSTAT(se.statistics.block_max); + PN_SCHEDSTAT(se.statistics.exec_max); + PN_SCHEDSTAT(se.statistics.slice_max); + PN_SCHEDSTAT(se.statistics.wait_max); + PN_SCHEDSTAT(se.statistics.wait_sum); + P_SCHEDSTAT(se.statistics.wait_count); + PN_SCHEDSTAT(se.statistics.iowait_sum); + P_SCHEDSTAT(se.statistics.iowait_count); + P_SCHEDSTAT(se.statistics.nr_migrations_cold); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); + P_SCHEDSTAT(se.statistics.nr_forced_migrations); + P_SCHEDSTAT(se.statistics.nr_wakeups); + P_SCHEDSTAT(se.statistics.nr_wakeups_sync); + P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); + P_SCHEDSTAT(se.statistics.nr_wakeups_local); + P_SCHEDSTAT(se.statistics.nr_wakeups_remote); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); + P_SCHEDSTAT(se.statistics.nr_wakeups_passive); + P_SCHEDSTAT(se.statistics.nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) __PN(avg_atom); __PN(avg_per_cpu); } -#endif + __P(nr_switches); SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); @@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); +#undef PN_SCHEDSTAT #undef PN #undef __PN +#undef P_SCHEDSTAT #undef P #undef __P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 157d741cec34..a6820b3771e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -821,26 +821,34 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } -#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start = rq_clock(rq_of(cfs_rq)); + u64 wait_start, prev_wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = rq_clock(rq_of(cfs_rq)); + prev_wait_start = schedstat_val(se->statistics.wait_start); if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > se->statistics.wait_start)) - wait_start -= se->statistics.wait_start; + likely(wait_start > prev_wait_start)) 
+ wait_start -= prev_wait_start; - se->statistics.wait_start = wait_start; + schedstat_set(se->statistics.wait_start, wait_start); } -static void +static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; u64 delta; - delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + if (!schedstat_enabled()) + return; + + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { p = task_of(se); @@ -850,59 +858,67 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - se->statistics.wait_start = delta; + schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - se->statistics.wait_max = max(se->statistics.wait_max, delta); - se->statistics.wait_count++; - se->statistics.wait_sum += delta; - se->statistics.wait_start = 0; + schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + schedstat_inc(se->statistics.wait_count); + schedstat_add(se->statistics.wait_sum, delta); + schedstat_set(se->statistics.wait_start, 0); } -static void +static inline void update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *tsk = NULL; + u64 sleep_start, block_start; + + if (!schedstat_enabled()) + return; + + sleep_start = schedstat_val(se->statistics.sleep_start); + block_start = schedstat_val(se->statistics.block_start); if (entity_is_task(se)) tsk = task_of(se); - if (se->statistics.sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; + if (sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; + if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) + schedstat_set(se->statistics.sleep_max, delta); - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; + schedstat_set(se->statistics.sleep_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } - if (se->statistics.block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; + if (block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; + if (unlikely(delta > schedstat_val(se->statistics.block_max))) + schedstat_set(se->statistics.block_max, delta); - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; + schedstat_set(se->statistics.block_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; + schedstat_add(se->statistics.iowait_sum, delta); + schedstat_inc(se->statistics.iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -929,6 +945,9 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + if (!schedstat_enabled()) + return; + /* * Are we enqueueing a waiting task? 
(for current tasks * a dequeue/enqueue event is a NOP) @@ -943,6 +962,10 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + + if (!schedstat_enabled()) + return; + /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -950,45 +973,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } + if (tsk->state & TASK_INTERRUPTIBLE) + schedstat_set(se->statistics.sleep_start, + rq_clock(rq_of(cfs_rq))); + if (tsk->state & TASK_UNINTERRUPTIBLE) + schedstat_set(se->statistics.block_start, + rq_clock(rq_of(cfs_rq))); } - -} -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ } -static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} -#endif - /* * We are picking a new current task - update its stats: */ @@ -3396,10 +3392,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) place_entity(cfs_rq, se, 0); check_schedstat_required(); - if (schedstat_enabled()) { - update_stats_enqueue(cfs_rq, se, flags); - check_spread(cfs_rq, se); - } + update_stats_enqueue(cfs_rq, se, flags); + check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3466,8 +3460,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - if (schedstat_enabled()) - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3541,25 +3534,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - if (schedstat_enabled()) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS + /* * Track our maximum slice length, if the CPU's load is at * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->statistics.slice_max = max(se->statistics.slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); + schedstat_set(se->statistics.slice_max, + max((u64)schedstat_val(se->statistics.slice_max), + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -3638,13 +3631,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - if (schedstat_enabled()) { - check_spread(cfs_rq, prev); - if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev); - } + check_spread(cfs_rq, prev); if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ -- cgit v1.2.3 From 47ae4b05d0fa2f2a998ebaf34d2dcbffca56a9db Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Aug 2016 08:48:43 +0200 Subject: virt, sched: Add generic vCPU pinning support Add generic virtualization support for pinning the current vCPU to a specified physical CPU. As this operation isn't performance critical (a very limited set of operations like BIOS calls and SMIs is expected to need this) just add a hypervisor specific indirection. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas_Warzecha@dell.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: boris.ostrovsky@oracle.com Cc: chrisw@sous-sol.org Cc: david.vrabel@citrix.com Cc: hpa@zytor.com Cc: jdelvare@suse.com Cc: jeremy@goop.org Cc: linux@roeck-us.net Cc: pali.rohar@gmail.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1472453327-19050-3-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- kernel/smp.c | 1 + kernel/up.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 3aa642d39c03..4274ca5f3bbc 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "smpboot.h" diff --git a/kernel/up.c b/kernel/up.c index 1760bf3d1463..3ccee2bd13ba 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -6,6 +6,7 @@ #include #include #include +#include int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) -- cgit v1.2.3 From df8ce9d78a4e7fbe7ddfd8ccee3ecaaa0013e883 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Aug 2016 08:48:44 +0200 Subject: smp: Add function to execute a function synchronously on a CPU On some hardware models (e.g. Dell Studio 1555 laptop) some hardware related functions (e.g. SMIs) are to be executed on physical CPU 0 only. Instead of open coding such a functionality multiple times in the kernel add a service function for this purpose. This will enable the possibility to take special measures in virtualized environments like Xen, too. 
Signed-off-by: Juergen Gross
Signed-off-by: Peter Zijlstra (Intel)
Cc: Douglas_Warzecha@dell.com
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: akataria@vmware.com
Cc: boris.ostrovsky@oracle.com
Cc: chrisw@sous-sol.org
Cc: david.vrabel@citrix.com
Cc: hpa@zytor.com
Cc: jdelvare@suse.com
Cc: jeremy@goop.org
Cc: linux@roeck-us.net
Cc: pali.rohar@gmail.com
Cc: rusty@rustcorp.com.au
Cc: virtualization@lists.linux-foundation.org
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/1472453327-19050-4-git-send-email-jgross@suse.com
Signed-off-by: Ingo Molnar
---
 kernel/smp.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/up.c  | 17 +++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 4274ca5f3bbc..f4f6137941cb 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -725,3 +725,53 @@ void wake_up_all_idle_cpus(void)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
+
+/**
+ * smp_call_on_cpu - Call a function on a specific cpu
+ *
+ * Used to call a function on a specific cpu and wait for it to return.
+ * Optionally make sure the call is done on a specified physical cpu via vcpu
+ * pinning in order to support virtualized environments.
+ */
+struct smp_call_on_cpu_struct {
+	struct work_struct	work;
+	struct completion	done;
+	int			(*func)(void *);
+	void			*data;
+	int			ret;
+	int			cpu;
+};
+
+static void smp_call_on_cpu_callback(struct work_struct *work)
+{
+	struct smp_call_on_cpu_struct *sscs;
+
+	sscs = container_of(work, struct smp_call_on_cpu_struct, work);
+	if (sscs->cpu >= 0)
+		hypervisor_pin_vcpu(sscs->cpu);
+	sscs->ret = sscs->func(sscs->data);
+	if (sscs->cpu >= 0)
+		hypervisor_pin_vcpu(-1);
+
+	complete(&sscs->done);
+}
+
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+	struct smp_call_on_cpu_struct sscs = {
+		.work = __WORK_INITIALIZER(sscs.work, smp_call_on_cpu_callback),
+		.done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
+		.func = func,
+		.data = par,
+		.cpu  = phys ? cpu : -1,
+	};
+
+	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+		return -ENXIO;
+
+	queue_work_on(cpu, system_wq, &sscs.work);
+	wait_for_completion(&sscs.done);
+
+	return sscs.ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
diff --git a/kernel/up.c b/kernel/up.c
index 3ccee2bd13ba..ee81ac9af4ca 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -83,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 	preempt_enable();
 }
 EXPORT_SYMBOL(on_each_cpu_cond);
+
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+	int ret;
+
+	if (cpu != 0)
+		return -ENXIO;
+
+	if (phys)
+		hypervisor_pin_vcpu(0);
+	ret = func(par);
+	if (phys)
+		hypervisor_pin_vcpu(-1);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
-- cgit v1.2.3


From c86d06ba2818c5126078cb0cf4e0175ec381045b Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 5 Sep 2016 08:38:13 -0400
Subject: PM / QoS: avoid calling cancel_delayed_work_sync() during early boot

of_clk_init() ends up calling into pm_qos_update_request() very early
during boot, where irq is expected to stay disabled.
pm_qos_update_request() uses cancel_delayed_work_sync(), which correctly
assumes that irq is enabled on invocation and unconditionally disables
and re-enables it.

Gate the cancel_delayed_work_sync() invocation with keventd_up() to
avoid enabling irq unexpectedly during early boot.
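The shape of the guard, as a standalone sketch (my_cancel_pending() is
an invented name; the actual qos.c hunk follows in the diff below):

static void my_cancel_pending(struct delayed_work *dwork)
{
	/*
	 * cancel_delayed_work_sync() may sleep and unconditionally
	 * re-enables IRQs on return, so skip it until the workqueue
	 * subsystem is up; before that point nothing can have been
	 * queued through it anyway.
	 */
	if (keventd_up())
		cancel_delayed_work_sync(dwork);
}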
Signed-off-by: Tejun Heo Reported-and-tested-by: Qiao Zhou Link: http://lkml.kernel.org/r/d2501c4c-8e7b-bea3-1b01-000b36b5dfe9@asrmicro.com Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - cancel_delayed_work_sync(&req->work); + /* + * This function may be called very early during boot, for example, + * from of_clk_init(), where irq needs to stay disabled. + * cancel_delayed_work_sync() assumes that irq is enabled on + * invocation and re-enables it on return. Avoid calling it until + * workqueue is initialized. + */ + if (keventd_up()) + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); -- cgit v1.2.3 From 3c1627e999e45e292d5d7ea3751ed86a6383ee2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Sep 2016 15:28:36 +0200 Subject: cpu/hotplug: Replace anon union Some compilers are unhappy with the anon union in the state array. Replace it with a named union. While at it align the state array initializers proper and add the missing name tags. Fixes: cf392d10b69e "cpu/hotplug: Add multi instance support" Reported-by: Ingo Molnar Reported-by: Fenguang Wu Signed-off-by: Thomas Gleixner Cc: rt@linutronix.de --- kernel/cpu.c | 145 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 74 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c90f839c5b86..2409ed717a3f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -73,15 +73,15 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); struct cpuhp_step { const char *name; union { - int (*startup)(unsigned int cpu); - int (*startup_multi)(unsigned int cpu, - struct hlist_node *node); - }; + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } startup; union { - int (*teardown)(unsigned int cpu); - int (*teardown_multi)(unsigned int cpu, - struct hlist_node *node); - }; + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } teardown; struct hlist_head list; bool skip_onerr; bool cant_stop; @@ -127,7 +127,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int ret, cnt; if (!step->multi_instance) { - cb = bringup ? step->startup : step->teardown; + cb = bringup ? step->startup.single : step->teardown.single; if (!cb) return 0; trace_cpuhp_enter(cpu, st->target, state, cb); @@ -135,7 +135,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } - cbm = bringup ? step->startup_multi : step->teardown_multi; + cbm = bringup ? step->startup.multi : step->teardown.multi; if (!cbm) return 0; @@ -160,7 +160,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, return 0; err: /* Rollback the instances if one failed */ - cbm = !bringup ? step->startup_multi : step->teardown_multi; + cbm = !bringup ? 
step->startup.multi : step->teardown.multi; if (!cbm) return ret; @@ -1256,40 +1256,40 @@ core_initcall(cpu_hotplug_pm_sync_init); static struct cpuhp_step cpuhp_bp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { .name = "threads:create", - .startup = smpboot_create_threads, - .teardown = NULL, + .startup.single = smpboot_create_threads, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_PERF_PREPARE] = { - .name = "perf prepare", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:prepare", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_WORKQUEUE_PREP] = { - .name = "workqueue prepare", - .startup = workqueue_prepare_cpu, - .teardown = NULL, + .name = "workqueue:prepare", + .startup.single = workqueue_prepare_cpu, + .teardown.single = NULL, }, [CPUHP_HRTIMERS_PREPARE] = { - .name = "hrtimers prepare", - .startup = hrtimers_prepare_cpu, - .teardown = hrtimers_dead_cpu, + .name = "hrtimers:prepare", + .startup.single = hrtimers_prepare_cpu, + .teardown.single = hrtimers_dead_cpu, }, [CPUHP_SMPCFD_PREPARE] = { - .name = "SMPCFD prepare", - .startup = smpcfd_prepare_cpu, - .teardown = smpcfd_dead_cpu, + .name = "SMPCFD:prepare", + .startup.single = smpcfd_prepare_cpu, + .teardown.single = smpcfd_dead_cpu, }, [CPUHP_RCUTREE_PREP] = { - .name = "RCU-tree prepare", - .startup = rcutree_prepare_cpu, - .teardown = rcutree_dead_cpu, + .name = "RCU-tree:prepare", + .startup.single = rcutree_prepare_cpu, + .teardown.single = rcutree_dead_cpu, }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers @@ -1297,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", - .startup = notify_prepare, - .teardown = notify_dead, + .startup.single = notify_prepare, + .teardown.single = notify_dead, .skip_onerr = true, .cant_stop = true, }, @@ -1308,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = { * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_DEAD] = { - .name = "timers dead", - .startup = NULL, - .teardown = timers_dead_cpu, + .name = "timers:dead", + .startup.single = NULL, + .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", - .startup = bringup_cpu, - .teardown = NULL, + .startup.single = bringup_cpu, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_AP_SMPCFD_DYING] = { - .startup = NULL, - .teardown = smpcfd_dying_cpu, + .name = "SMPCFD:dying", + .startup.single = NULL, + .teardown.single = smpcfd_dying_cpu, }, /* * Handled on controll processor until the plugged processor manages @@ -1329,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", - .startup = NULL, - .teardown = takedown_cpu, + .startup.single = NULL, + .teardown.single = takedown_cpu, .cant_stop = true, }, #else @@ -1356,22 +1357,23 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* First state is scheduler control. 
Interrupts are disabled */ [CPUHP_AP_SCHED_STARTING] = { .name = "sched:starting", - .startup = sched_cpu_starting, - .teardown = sched_cpu_dying, + .startup.single = sched_cpu_starting, + .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { - .startup = NULL, - .teardown = rcutree_dying_cpu, + .name = "RCU-tree:dying", + .startup.single = NULL, + .teardown.single = rcutree_dying_cpu, }, /* - * Low level startup/teardown notifiers. Run with interrupts + * Low level startup.single/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to * states. */ [CPUHP_AP_NOTIFY_STARTING] = { .name = "notify:starting", - .startup = notify_starting, - .teardown = notify_dying, + .startup.single = notify_starting, + .teardown.single = notify_dying, .skip_onerr = true, .cant_stop = true, }, @@ -1383,23 +1385,23 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", - .startup = smpboot_unpark_threads, - .teardown = NULL, + .startup.single = smpboot_unpark_threads, + .teardown.single = NULL, }, [CPUHP_AP_PERF_ONLINE] = { - .name = "perf online", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:online", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_AP_WORKQUEUE_ONLINE] = { - .name = "workqueue online", - .startup = workqueue_online_cpu, - .teardown = workqueue_offline_cpu, + .name = "workqueue:online", + .startup.single = workqueue_online_cpu, + .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RCUTREE_ONLINE] = { - .name = "RCU-tree online", - .startup = rcutree_online_cpu, - .teardown = rcutree_offline_cpu, + .name = "RCU-tree:online", + .startup.single = rcutree_online_cpu, + .teardown.single = rcutree_offline_cpu, }, /* @@ -1408,8 +1410,8 @@ static struct cpuhp_step cpuhp_ap_states[] = { */ [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", - .startup = notify_online, - .teardown = notify_down_prepare, + .startup.single = notify_online, + .teardown.single = notify_down_prepare, .skip_onerr = true, }, #endif @@ -1421,16 +1423,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* Last state is scheduler control setting the cpu active */ [CPUHP_AP_ACTIVE] = { .name = "sched:active", - .startup = sched_cpu_activate, - .teardown = sched_cpu_deactivate, + .startup.single = sched_cpu_activate, + .teardown.single = sched_cpu_deactivate, }, #endif /* CPU is fully up and running. 
*/ [CPUHP_ONLINE] = { .name = "online", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, }; @@ -1453,8 +1455,8 @@ static void cpuhp_store_callbacks(enum cpuhp_state state, mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(state); - sp->startup = startup; - sp->teardown = teardown; + sp->startup.single = startup; + sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); @@ -1463,7 +1465,7 @@ static void cpuhp_store_callbacks(enum cpuhp_state state, static void *cpuhp_get_teardown_cb(enum cpuhp_state state) { - return cpuhp_get_step(state)->teardown; + return cpuhp_get_step(state)->teardown.single; } /* @@ -1476,7 +1478,8 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; - if ((bringup && !sp->startup) || (!bringup && !sp->teardown)) + if ((bringup && !sp->startup.single) || + (!bringup && !sp->teardown.single)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown @@ -1554,7 +1557,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, get_online_cpus(); - if (!invoke || !sp->startup_multi) + if (!invoke || !sp->startup.multi) goto add_node; /* @@ -1570,7 +1573,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, ret = cpuhp_issue_call(cpu, state, true, node); if (ret) { - if (sp->teardown_multi) + if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); goto err; } -- cgit v1.2.3 From 545d5d657b720e9c4dc773265bb7e9d88e34b269 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 31 May 2016 13:56:48 +0100 Subject: genirq: Update stale comment for __irq_domain_add Commit 1bf4ddc46c5d ("irqdomain: Introduce irq_domain_create_{linear, tree}") introduced the use of fwnode_handle to identify the interrupt controller when calling __irq_domain_add but missed updating the kernel doc parameters for the function. Update this comment. While we are touching this code, also consolidate the declaration and assignment of of_node. 
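For context, a minimal sketch of the fwnode-based creation path that the corrected kernel-doc describes (my_domain_ops, the size, and host_data are illustrative placeholders, not part of this patch):

	/* Create a linear domain from a firmware node; this wrapper ends
	 * up in __irq_domain_add() with hwirq_max == size.
	 */
	struct irq_domain *d;

	d = irq_domain_create_linear(fwnode, 32, &my_domain_ops, host_data);
	if (!d)
		return -ENOMEM;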
Signed-off-by: Punit Agrawal Acked-by: Marc Zyngier Link: http://lkml.kernel.org/r/1464699409-23113-1-git-send-email-punit.agrawal@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f10cffe8aefb..8c0a0ae43521 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure - * @of_node: optional device-tree node of the interrupt controller + * @fwnode: firmware node for the interrupt controller * @size: Size of linear map; 0 for radix mapping only * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no @@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { + struct device_node *of_node = to_of_node(fwnode); struct irq_domain *domain; - struct device_node *of_node; - - of_node = to_of_node(fwnode); domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); -- cgit v1.2.3 From e8b61b3f2c5d3ee7804766621c91f38737d38105 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Jun 2016 10:43:29 +0200 Subject: futex: Add some more function commentary Add some more comments and reformat existing ones to kernel-doc style. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Darren Hart Link: http://lkml.kernel.org/r/1464770609-30168-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a301bc1..2c4be467fecd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -/* - * We hash on the keys returned from get_futex_key (see below). +/** + * hash_futex - Return the hash bucket in the global hash + * @key: Pointer to the futex key for which the hash is calculated + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket in the global hash. */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { @@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) return &futex_queues[hash & (futex_hashsize - 1)]; } -/* + +/** + * match_futex - Check whether two futex keys are equal + * @key1: Pointer to key1 + * @key2: Pointer to key2 + * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int match_futex(union futex_key *key1, union futex_key *key2) -- cgit v1.2.3 From 00b992deaa08495ab958da5950c9ebbba27d0ddc Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 19 Jul 2016 15:54:08 +0600 Subject: genirq: No need to mask non trigger mode flags before __irq_set_trigger() Some callers of __irq_set_trigger() mask all flags except trigger mode flags. This is unnecessary, as __irq_set_trigger() already does this before using the flags. [ tglx: Moved the flag mask and adjusted comment.
Removed the hunk in enable_percpu_irq() as it is required there ] Signed-off-by: Alexander Kuleshov Link: http://lkml.kernel.org/r/20160719095408.13778-1-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 1 - kernel/irq/manage.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 93c373a8b12b..e11e8afcf209 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) if (!desc) return -EINVAL; - type &= IRQ_TYPE_SENSE_MASK; ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 73a2b786b5e9..4908617dee28 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) return 0; } - flags &= IRQ_TYPE_SENSE_MASK; - if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); @@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) unmask = 1; } - /* caller masked out all except trigger mode flags */ + /* Mask all flags except trigger mode */ + flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { -- cgit v1.2.3 From 677f6646533d701c8609b8bcb9304173c11cc194 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 Sep 2016 16:13:48 +0200 Subject: cpu/hotplug: Make state names consistent We should have all names in the scheme "[subsys/]facility:state". Fix the core to comply. Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2409ed717a3f..32eef273a0b9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1261,7 +1261,7 @@ static struct cpuhp_step cpuhp_bp_states[] = { }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { - .name = "threads:create", + .name = "threads:prepare", .startup.single = smpboot_create_threads, .teardown.single = NULL, .cant_stop = true, @@ -1282,12 +1282,12 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = hrtimers_dead_cpu, }, [CPUHP_SMPCFD_PREPARE] = { - .name = "SMPCFD:prepare", + .name = "smpcfd:prepare", .startup.single = smpcfd_prepare_cpu, .teardown.single = smpcfd_dead_cpu, }, [CPUHP_RCUTREE_PREP] = { - .name = "RCU-tree:prepare", + .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, .teardown.single = rcutree_dead_cpu, }, @@ -1320,7 +1320,7 @@ static struct cpuhp_step cpuhp_bp_states[] = { .cant_stop = true, }, [CPUHP_AP_SMPCFD_DYING] = { - .name = "SMPCFD:dying", + .name = "smpcfd:dying", .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, @@ -1361,7 +1361,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { - .name = "RCU-tree:dying", + .name = "RCU/tree:dying", .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, @@ -1384,7 +1384,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { - .name = "smpboot:threads", + .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, .teardown.single = NULL, }, @@ -1399,7 +1399,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RCUTREE_ONLINE] = { - .name = "RCU-tree:online",
+ .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, .teardown.single = rcutree_offline_cpu, }, -- cgit v1.2.3 From ee1e714b94521b0bb27b04dfd1728ec51b19d4f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Aug 2016 14:57:16 +0200 Subject: cpu/hotplug: Remove CPU_STARTING and CPU_DYING notifier All users are converted to state machine, remove CPU_STARTING and the corresponding CPU_DYING. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-2-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 32eef273a0b9..d14ae4438e8e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -408,12 +408,6 @@ static int notify_online(unsigned int cpu) return 0; } -static int notify_starting(unsigned int cpu) -{ - cpu_notify(CPU_STARTING, cpu); - return 0; -} - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -759,12 +753,6 @@ static int notify_down_prepare(unsigned int cpu) return err; } -static int notify_dying(unsigned int cpu) -{ - cpu_notify(CPU_DYING, cpu); - return 0; -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { @@ -823,7 +811,7 @@ static int takedown_cpu(unsigned int cpu) BUG_ON(cpu_online(cpu)); /* - * The migration_call() CPU_DYING callback will have removed all + * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all * runnable tasks from the cpu, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * @@ -876,7 +864,6 @@ void cpuhp_report_idle_dead(void) #define notify_down_prepare NULL #define takedown_cpu NULL #define notify_dead NULL -#define notify_dying NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -966,10 +953,9 @@ EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started * - * This function calls the cpu_chain notifiers with CPU_STARTING. * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ @@ -1365,18 +1351,6 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, - /* - * Low level startup.single/teardown notifiers. Run with interrupts - * disabled. Will be removed once the notifiers are converted to - * states. - */ - [CPUHP_AP_NOTIFY_STARTING] = { - .name = "notify:starting", - .startup.single = notify_starting, - .teardown.single = notify_dying, - .skip_onerr = true, - .cant_stop = true, - }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { -- cgit v1.2.3 From 017c59c042d01fc84cae7a8ea475861e702c77ab Mon Sep 17 00:00:00 2001 From: Akash Goel Date: Fri, 2 Sep 2016 21:47:38 +0200 Subject: relay: Use per CPU constructs for the relay channel buffer pointers relay essentially needs to maintain a per CPU array of channel buffer pointers but it manually creates that array. Instead its better to use the per CPU constructs, provided by the kernel, to allocate & access the array of pointer to channel buffers. 
Signed-off-by: Akash Goel Reviewed-by: Chris Wilson Link: http://lkml.kernel.org/r/1470909140-25919-1-git-send-email-akash.goel@intel.com Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/relay.c | 74 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..ed157378f6cb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) __free_page(buf->page_array[i]); relay_free_page_array(buf->page_array); } - chan->buf[buf->cpu] = NULL; + *per_cpu_ptr(chan->buf, buf->cpu) = NULL; kfree(buf->padding); kfree(buf); kref_put(&chan->kref, relay_destroy_channel); @@ -382,20 +382,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) */ void relay_reset(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - __relay_reset(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + __relay_reset(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - __relay_reset(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + __relay_reset(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_reset); @@ -440,7 +441,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) struct dentry *dentry; if (chan->is_global) - return chan->buf[0]; + return *per_cpu_ptr(chan->buf, 0); buf = relay_create_buf(chan); if (!buf) @@ -464,7 +465,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) __relay_reset(buf, 1); if(chan->is_global) { - chan->buf[0] = buf; + *per_cpu_ptr(chan->buf, 0) = buf; buf->cpu = 0; } @@ -526,22 +527,24 @@ static int relay_hotcpu_callback(struct notifier_block *nb, { unsigned int hotcpu = (unsigned long)hcpu; struct rchan *chan; + struct rchan_buf *buf; switch(action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { - if (chan->buf[hotcpu]) + if ((buf = *per_cpu_ptr(chan->buf, hotcpu))) continue; - chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); - if(!chan->buf[hotcpu]) { + buf = relay_open_buf(chan, hotcpu); + if (!buf) { printk(KERN_ERR "relay_hotcpu_callback: cpu %d buffer " "creation failed\n", hotcpu); mutex_unlock(&relay_channels_mutex); return notifier_from_errno(-ENOMEM); } + *per_cpu_ptr(chan->buf, hotcpu) = buf; } mutex_unlock(&relay_channels_mutex); break; @@ -583,6 +586,7 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; + struct rchan_buf *buf; if (!(subbuf_size && n_subbufs)) return NULL; @@ -593,6 +597,7 @@ struct rchan *relay_open(const char *base_filename, if (!chan) return NULL; + chan->buf = alloc_percpu(struct rchan_buf *); chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; @@ -608,9 +613,10 @@ struct rchan *relay_open(const char *base_filename, mutex_lock(&relay_channels_mutex); for_each_online_cpu(i) { - chan->buf[i] = relay_open_buf(chan, i); - if (!chan->buf[i]) + buf = relay_open_buf(chan, i); + if (!buf) goto free_bufs; + *per_cpu_ptr(chan->buf, i) = buf; } list_add(&chan->list, &relay_channels); mutex_unlock(&relay_channels_mutex); @@ -619,8 +625,8 @@ struct rchan *relay_open(const char *base_filename, free_bufs: for_each_possible_cpu(i) { - 
if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); } kref_put(&chan->kref, relay_destroy_channel); @@ -666,6 +672,7 @@ int relay_late_setup_files(struct rchan *chan, unsigned int i, curr_cpu; unsigned long flags; struct dentry *dentry; + struct rchan_buf *buf; struct rchan_percpu_buf_dispatcher disp; if (!chan || !base_filename) @@ -684,10 +691,11 @@ int relay_late_setup_files(struct rchan *chan, if (chan->is_global) { err = -EINVAL; - if (!WARN_ON_ONCE(!chan->buf[0])) { - dentry = relay_create_buf_file(chan, chan->buf[0], 0); + buf = *per_cpu_ptr(chan->buf, 0); + if (!WARN_ON_ONCE(!buf)) { + dentry = relay_create_buf_file(chan, buf, 0); if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(chan->buf[0], dentry); + relay_set_buf_dentry(buf, dentry); err = 0; } } @@ -702,13 +710,14 @@ int relay_late_setup_files(struct rchan *chan, * on all currently online CPUs. */ for_each_online_cpu(i) { - if (unlikely(!chan->buf[i])) { + buf = *per_cpu_ptr(chan->buf, i); + if (unlikely(!buf)) { WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); err = -EINVAL; break; } - dentry = relay_create_buf_file(chan, chan->buf[i], i); + dentry = relay_create_buf_file(chan, buf, i); if (unlikely(!dentry)) { err = -EINVAL; break; @@ -716,10 +725,10 @@ int relay_late_setup_files(struct rchan *chan, if (curr_cpu == i) { local_irq_save(flags); - relay_set_buf_dentry(chan->buf[i], dentry); + relay_set_buf_dentry(buf, dentry); local_irq_restore(flags); } else { - disp.buf = chan->buf[i]; + disp.buf = buf; disp.dentry = dentry; smp_mb(); /* relay_channels_mutex must be held, so wait. */ @@ -822,11 +831,10 @@ void relay_subbufs_consumed(struct rchan *chan, if (!chan) return; - if (cpu >= NR_CPUS || !chan->buf[cpu] || - subbufs_consumed > chan->n_subbufs) + buf = *per_cpu_ptr(chan->buf, cpu); + if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) return; - buf = chan->buf[cpu]; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) buf->subbufs_consumed = buf->subbufs_produced; else @@ -842,18 +850,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); */ void relay_close(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; mutex_lock(&relay_channels_mutex); - if (chan->is_global && chan->buf[0]) - relay_close_buf(chan->buf[0]); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) + relay_close_buf(buf); else for_each_possible_cpu(i) - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " @@ -874,20 +883,21 @@ EXPORT_SYMBOL_GPL(relay_close); */ void relay_flush(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - relay_switch_subbuf(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + relay_switch_subbuf(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - relay_switch_subbuf(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_switch_subbuf(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_flush); -- cgit v1.2.3 From e6d4989a9ad1ccc343f29578a461612ed80fc6c5 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 18 Aug 2016 14:57:17 +0200 Subject: relayfs: Convert to hotplug state machine Install the callbacks via the state machine. 
They are installed at run time but relay_prepare_cpu() does not need to be invoked by the boot CPU because relay_open() was not yet invoked and there are no pools that need to be created. Signed-off-by: Richard Weinberger Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: rt@linutronix.de Cc: Andrew Morton Link: http://lkml.kernel.org/r/20160818125731.27256-3-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 6 ++++++ kernel/relay.c | 58 +++++++++++++--------------------------------------------- 2 files changed, 19 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d14ae4438e8e..0c0d4b2ddd1c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -1272,6 +1273,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup.single = smpcfd_prepare_cpu, .teardown.single = smpcfd_dead_cpu, }, + [CPUHP_RELAY_PREPARE] = { + .name = "relay:prepare", + .startup.single = relay_prepare_cpu, + .teardown.single = NULL, + }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, diff --git a/kernel/relay.c b/kernel/relay.c index ed157378f6cb..fc9b4a4af463 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -513,48 +513,25 @@ static void setup_callbacks(struct rchan *chan, chan->cb = cb; } -/** - * relay_hotcpu_callback - CPU hotplug callback - * @nb: notifier block - * @action: hotplug action to take - * @hcpu: CPU number - * - * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) - */ -static int relay_hotcpu_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) +int relay_prepare_cpu(unsigned int cpu) { - unsigned int hotcpu = (unsigned long)hcpu; struct rchan *chan; struct rchan_buf *buf; - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&relay_channels_mutex); - list_for_each_entry(chan, &relay_channels, list) { - if ((buf = *per_cpu_ptr(chan->buf, hotcpu))) - continue; - buf = relay_open_buf(chan, hotcpu); - if (!buf) { - printk(KERN_ERR - "relay_hotcpu_callback: cpu %d buffer " - "creation failed\n", hotcpu); - mutex_unlock(&relay_channels_mutex); - return notifier_from_errno(-ENOMEM); - } - *per_cpu_ptr(chan->buf, hotcpu) = buf; + mutex_lock(&relay_channels_mutex); + list_for_each_entry(chan, &relay_channels, list) { + if ((buf = *per_cpu_ptr(chan->buf, cpu))) + continue; + buf = relay_open_buf(chan, cpu); + if (!buf) { + pr_err("relay: cpu %d buffer creation failed\n", cpu); + mutex_unlock(&relay_channels_mutex); + return -ENOMEM; } - mutex_unlock(&relay_channels_mutex); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* No need to flush the cpu : will be flushed upon - * final relay_flush() call. 
*/ - break; + *per_cpu_ptr(chan->buf, cpu) = buf; } - return NOTIFY_OK; + mutex_unlock(&relay_channels_mutex); + return 0; } /** @@ -1387,12 +1364,3 @@ const struct file_operations relay_file_operations = { .splice_read = relay_file_splice_read, }; EXPORT_SYMBOL_GPL(relay_file_operations); - -static __init int relay_init(void) -{ - - hotcpu_notifier(relay_hotcpu_callback, 0); - return 0; -} - -early_initcall(relay_init); -- cgit v1.2.3 From 6731d4f12315aed5f7eefc52dac30428e382d7d0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 23 Aug 2016 14:53:19 +0200 Subject: slab: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Richard Weinberger Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: rt@linutronix.de Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Christoph Lameter Link: http://lkml.kernel.org/r/20160823125319.abeapfjapf2kfezp@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0c0d4b2ddd1c..7c783876cbcb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -1278,6 +1279,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, + [CPUHP_SLAB_PREPARE] = { + .name = "slab:prepare", + .startup.single = slab_prepare_cpu, + .teardown.single = slab_dead_cpu, + }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, -- cgit v1.2.3 From c4544dbc7a9bce3da6fa2361cd68cadb34e9221f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Aug 2016 14:57:21 +0200 Subject: kernel/softirq: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160818125731.27256-7-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..c372114494f5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -700,7 +700,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) BUG(); } -static void takeover_tasklets(unsigned int cpu) +static int takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. 
*/ local_irq_disable(); @@ -723,27 +723,12 @@ static void takeover_tasklets(unsigned int cpu) raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); + return 0; } +#else +#define takeover_tasklets NULL #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action) { -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - takeover_tasklets((unsigned long)hcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback -}; - static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, .thread_should_run = ksoftirqd_should_run, @@ -753,8 +738,8 @@ static struct smp_hotplug_thread softirq_threads = { static __init int spawn_ksoftirqd(void) { - register_cpu_notifier(&cpu_nfb); - + cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, + takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); return 0; -- cgit v1.2.3 From f1e4ba5b6a6555d1f6b174d4fd0a96a9363bb57f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Sep 2016 15:10:22 +0200 Subject: perf, bpf: fix conditional call to bpf_overflow_handler The newly added bpf_overflow_handler function is only built if both CONFIG_EVENT_TRACING and CONFIG_BPF_SYSCALL are enabled, but the caller only checks the latter: kernel/events/core.c: In function 'perf_event_alloc': kernel/events/core.c:9106:27: error: 'bpf_overflow_handler' undeclared (first use in this function) This changes the caller so we also skip this call if CONFIG_EVENT_TRACING is disabled entirely. Signed-off-by: Arnd Bergmann Fixes: aa6a5f3cb2b2 ("perf, bpf: add perf events core support for BPF_PROG_TYPE_PERF_EVENT programs") Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 85bf4c37911f..a7b8c1c75fa7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9072,7 +9072,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; -#ifdef CONFIG_BPF_SYSCALL +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); -- cgit v1.2.3 From 2d2be8cab26ed918e94d2deae89580003242a123 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Sep 2016 01:03:42 +0200 Subject: bpf: fix range propagation on direct packet access LLVM can generate code that tests for direct packet access via skb->data/data_end in a way that currently gets rejected by the verifier, example: [...] 7: (61) r3 = *(u32 *)(r6 +80) 8: (61) r9 = *(u32 *)(r6 +76) 9: (bf) r2 = r9 10: (07) r2 += 54 11: (3d) if r3 >= r2 goto pc+12 R1=inv R2=pkt(id=0,off=54,r=0) R3=pkt_end R4=inv R6=ctx R9=pkt(id=0,off=0,r=0) R10=fp 12: (18) r4 = 0xffffff7a 14: (05) goto pc+430 [...]
from 11 to 24: R1=inv R2=pkt(id=0,off=54,r=0) R3=pkt_end R4=inv R6=ctx R9=pkt(id=0,off=0,r=0) R10=fp 24: (7b) *(u64 *)(r10 -40) = r1 25: (b7) r1 = 0 26: (63) *(u32 *)(r6 +56) = r1 27: (b7) r2 = 40 28: (71) r8 = *(u8 *)(r9 +20) invalid access to packet, off=20 size=1, R9(id=0,off=0,r=0) The reason why this gets rejected despite a proper test is that we currently call find_good_pkt_pointers() only in case where we detect tests like rX > pkt_end, where rX is of type pkt(id=Y,off=Z,r=0) and derived, for example, from a register of type pkt(id=Y,off=0,r=0) pointing to skb->data. find_good_pkt_pointers() then fills the range in the current branch to pkt(id=Y,off=0,r=Z) on success. For above case, we need to extend that to recognize pkt_end >= rX pattern and mark the other branch that is taken on success with the appropriate pkt(id=Y,off=0,r=Z) type via find_good_pkt_pointers(). Since eBPF operates on BPF_JGT (>) and BPF_JGE (>=), these are the only two practical options to test for from what LLVM could have generated, since there's no such thing as BPF_JLT (<) or BPF_JLE (<=) that we would need to take into account as well. After the fix: [...] 7: (61) r3 = *(u32 *)(r6 +80) 8: (61) r9 = *(u32 *)(r6 +76) 9: (bf) r2 = r9 10: (07) r2 += 54 11: (3d) if r3 >= r2 goto pc+12 R1=inv R2=pkt(id=0,off=54,r=0) R3=pkt_end R4=inv R6=ctx R9=pkt(id=0,off=0,r=0) R10=fp 12: (18) r4 = 0xffffff7a 14: (05) goto pc+430 [...] from 11 to 24: R1=inv R2=pkt(id=0,off=54,r=54) R3=pkt_end R4=inv R6=ctx R9=pkt(id=0,off=0,r=54) R10=fp 24: (7b) *(u64 *)(r10 -40) = r1 25: (b7) r1 = 0 26: (63) *(u32 *)(r6 +56) = r1 27: (b7) r2 = 40 28: (71) r8 = *(u8 *)(r9 +20) 29: (bf) r1 = r8 30: (25) if r8 > 0x3c goto pc+47 R1=inv56 R2=imm40 R3=pkt_end R4=inv R6=ctx R8=inv56 R9=pkt(id=0,off=0,r=54) R10=fp 31: (b7) r1 = 1 [...] Verifier test cases are also added in this work, one that demonstrates the mentioned example here and one that tries a bad packet access for the current/fall-through branch (the one with types pkt(id=X,off=Y,r=0), pkt(id=X,off=0,r=0)), then a case with good and bad accesses, and two with both test variants (>, >=). Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48c2705db22c..90493a66dddd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1637,21 +1637,42 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct verifier_env *env, - struct reg_state *dst_reg) +static void find_good_pkt_pointers(struct verifier_state *state, + const struct reg_state *dst_reg) { - struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs, *reg; int i; - /* r2 = r3; - * r2 += 8 - * if (r2 > pkt_end) goto somewhere - * r2 == dst_reg, pkt_end == src_reg, - * r2=pkt(id=n,off=8,r=0) - * r3=pkt(id=n,off=0,r=0) - * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) - * so that range of bytes [r3, r3 + 8) is safe to access + + /* LLVM can generate two kind of checks: + * + * Type 1: + * + * r2 = r3; + * r2 += 8; + * if (r2 > pkt_end) goto + * + * + * Where: + * r2 == dst_reg, pkt_end == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Type 2: + * + * r2 = r3; + * r2 += 8; + * if (pkt_end >= r2) goto + * + * + * Where: + * pkt_end == dst_reg, r2 == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access. */ + for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) regs[i].range = dst_reg->off; @@ -1668,8 +1689,8 @@ static void find_good_pkt_pointers(struct verifier_env *env, static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; - struct verifier_state *other_branch; + struct verifier_state *other_branch, *this_branch = &env->cur_state; + struct reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1750,13 +1771,17 @@ static int check_cond_jmp_op(struct verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(env, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } if (log_level) - print_verifier_state(&env->cur_state); + print_verifier_state(this_branch); return 0; } -- cgit v1.2.3 From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Sep 2016 08:51:21 -0700 Subject: mm: fix cache mode of dax pmd mappings track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as uncacheable rendering them impractical for application usage. DAX-pte mappings are cached and the goal of establishing DAX-pmd mappings is to attain more performance, not dramatically less (3 orders of magnitude). track_pfn_insert() relies on a previous call to reserve_memtype() to establish the expected page_cache_mode for the range. 
While memremap() arranges for reserve_memtype() to be called, devm_memremap_pages() does not. So, teach track_pfn_insert() and untrack_pfn() how to handle tracking without a vma, and arrange for devm_memremap_pages() to establish the write-back-cache reservation in the memtype tree. Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Nilesh Choudhury Cc: Kirill A. Shutemov Reported-by: Toshi Kani Reported-by: Kai Zhang Acked-by: Andrew Morton Signed-off-by: Dan Williams --- kernel/memremap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); -- cgit v1.2.3 From 6088b5823b4cb132a838878747384cbfb5ce6646 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:28 +0200 Subject: bpf: minor cleanups in helpers Some minor misc cleanups, f.e. use sizeof(__u32) instead of hardcoding and in __bpf_skb_max_len(), I missed that we always have skb->dev valid anyway, so we can drop the unneeded test for dev; a few more misc bits are also addressed here. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/helpers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ea3afba1a4f..6df73bd1ba34 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -126,7 +126,7 @@ static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct task_struct *task = current; - if (!task) + if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; @@ -144,12 +144,12 @@ static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) kuid_t uid; kgid_t gid; - if (!task) + if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | - from_kuid(&init_user_ns, uid); + from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { -- cgit v1.2.3 From f035a51536af9802f55d8c79bd87f184ebffb093 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:29 +0200 Subject: bpf: add BPF_SIZEOF and BPF_FIELD_SIZEOF macros Add BPF_SIZEOF() and BPF_FIELD_SIZEOF() macros to improve the code a bit which otherwise often result in overly long bytes_to_bpf_size(sizeof()) and bytes_to_bpf_size(FIELD_SIZEOF()) lines. So place them into a macro helper instead. Moreover, we currently have a BUILD_BUG_ON(BPF_FIELD_SIZEOF()) check in convert_bpf_extensions(), but we should rather make that generic as well and add a BUILD_BUG_ON() test in all BPF_SIZEOF()/BPF_FIELD_SIZEOF() users to detect any rewriter size issues at compile time. Note, there are currently none, but we want to assert that it stays this way. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d3869b03d9fe..e63d7d435796 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,18 +583,18 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), - dst_reg, src_reg, + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, offsetof(struct perf_sample_data, period)); break; default: - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), - dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), dst_reg, src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), - dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); break; } -- cgit v1.2.3 From f3694e00123802d688180e7ae90b240669910e3c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:31 +0200 Subject: bpf: add BPF_CALL_x macros for declaring helpers This work adds BPF_CALL_() macros and converts all the eBPF helper functions to use them, in a similar fashion like we do with SYSCALL_DEFINE() macros that are used today. 
Motivation for this is to hide all the register handling and all necessary casts from the user, so that it is done automatically in the background when adding a BPF_CALL_() call. This makes current helpers easier to review, eases to write future helpers, avoids getting the casting mess wrong, and allows for extending all helpers at once (f.e. build time checks, etc). It also helps detecting more easily in code reviews that unused registers are not instrumented in the code by accident, breaking compatibility with existing programs. BPF_CALL_() internals are quite similar to SYSCALL_DEFINE() ones with some fundamental differences, for example, for generating the actual helper function that carries all u64 regs, we need to fill unused regs, so that we always end up with 5 u64 regs as an argument. I reviewed several 0-5 generated BPF_CALL_() variants of the .i results and they look all as expected. No sparse issue spotted. We let this also sit for a few days with Fengguang's kbuild test robot, and there were no issues seen. On s390, it barked on the "uses dynamic stack allocation" notice, which is an old one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion to the call wrapper, just telling that the perf raw record/frag sits on stack (gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests and they were fine as well. All eBPF helpers are now converted to use these macros, getting rid of a good chunk of all the raw castings. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 +- kernel/bpf/helpers.c | 46 ++++++++--------------------- kernel/bpf/stackmap.c | 5 ++-- kernel/trace/bpf_trace.c | 75 +++++++++++++++++++++++------------------------- 4 files changed, 51 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 03fd23d4d587..7b7baaed9ed4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) prandom_init_once(&bpf_user_rnd_state); } -u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6df73bd1ba34..a5b8bf8cfcfd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -26,24 +27,10 @@ * if program is allowed to access maps, so check rcu_read_lock_held in * all three functions. 
*/ -static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - /* verifier checked that R1 contains a valid pointer to bpf_map - * and R2 points to a program stack and map->key_size bytes were - * initialized - */ - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value; - WARN_ON_ONCE(!rcu_read_lock_held()); - - value = map->ops->map_lookup_elem(map, key); - - /* lookup() returns either pointer to element value or NULL - * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type - */ - return (unsigned long) value; + return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { @@ -54,15 +41,11 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value = (void *) (unsigned long) r3; - WARN_ON_ONCE(!rcu_read_lock_held()); - - return map->ops->map_update_elem(map, key, value, r4); + return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { @@ -75,13 +58,9 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - WARN_ON_ONCE(!rcu_read_lock_held()); - return map->ops->map_delete_elem(map, key); } @@ -99,7 +78,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } @@ -110,7 +89,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); @@ -122,7 +101,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -138,7 +117,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; @@ -158,10 +137,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; - char *buf = (char *) (long) r1; if (unlikely(!task)) goto err_clear; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bf4495fcd25d..732ae16d12b7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,10 +116,9 @@ free_smap: return ERR_PTR(err); } -u64 bpf_get_stackid(u64 r1, u64 r2, 
u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e63d7d435796..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -61,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - void *dst = (void *) (long) r1; - int ret, size = (int) r2; - void *unsafe_ptr = (void *) (long) r3; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) @@ -83,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, + u32, size) { - void *unsafe_ptr = (void *) (long) r1; - void *src = (void *) (long) r2; - int size = (int) r3; - /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. @@ -130,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { - char *fmt = (char *) (long) r1; bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; @@ -178,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) switch (fmt_cnt) { case 1: - unsafe_addr = r3; - r3 = (long) buf; + unsafe_addr = arg1; + arg1 = (long) buf; break; case 2: - unsafe_addr = r4; - r4 = (long) buf; + unsafe_addr = arg2; + arg2 = (long) buf; break; case 3: - unsafe_addr = r5; - r5 = (long) buf; + unsafe_addr = arg3; + arg3 = (long) buf; break; } buf[0] = 0; @@ -209,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) } return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, - mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, - mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); + mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, + mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, + mod[2] == 2 ? arg3 : mod[2] == 1 ? 
(long) arg3 : (u32) arg3); } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -233,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; @@ -312,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return 0; } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { - struct pt_regs *regs = (struct pt_regs *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; - void *data = (void *)(long) r4; struct perf_raw_record raw = { .frag = { .size = size, @@ -367,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, return __bpf_perf_event_output(regs, map, flags, &raw); } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task) { return (long) current; } @@ -378,16 +370,13 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { - struct bpf_map *map = (struct bpf_map *)(long)r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - u32 idx = (u32)r2; if (unlikely(in_interrupt())) return -EINVAL; - if (unlikely(idx >= array->map.max_entries)) return -E2BIG; @@ -481,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it - * from there and call the same bpf_perf_event_output() helper + * from there and call the same bpf_perf_event_output() helper inline. */ - u64 ctx = *(long *)(uintptr_t)r1; - - return bpf_perf_event_output(ctx, r2, index, r4, size); + return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -504,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) { - u64 ctx = *(long *)(uintptr_t)r1; + struct pt_regs *regs = *(struct pt_regs **)tp_buff; - return bpf_get_stackid(ctx, r2, r3, r4, r5); + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. 
+ */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { -- cgit v1.2.3 From 767ae08678c2c796bcd7f582ee457aee20a28a1e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:49 +0300 Subject: perf/core: Fix a race between mmap_close() and set_output() of AUX events In the mmap_close() path we need to stop all the AUX events that are writing data to the AUX area that we are unmapping, before we can safely free the pages. To determine if an event needs to be stopped, we're comparing its ->rb against the one that's getting unmapped. However, a SET_OUTPUT ioctl may turn up inside an AUX transaction and swizzle event::rb to some other ring buffer, but the transaction will keep writing data to the old ring buffer until the event gets scheduled out. At this point, mmap_close() will skip over such an event and will proceed to free the AUX area, while it's still being used by this event, which will set off a warning in the mmap_close() path and cause a memory corruption. To avoid this, always stop an AUX event before its ->rb is updated; this will release the (potentially) last reference on the AUX area of the buffer. If the event gets restarted, its new ring buffer will be used. If another SET_OUTPUT comes and switches it back to the old ring buffer that's getting unmapped, it's also fine: this ring buffer's aux_mmap_count will be zero and AUX transactions won't start any more. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 07ac8596a728..a54f2c2cdb20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2496,11 +2496,11 @@ static int __perf_event_stop(void *info) return 0; } -static int perf_event_restart(struct perf_event *event) +static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, - .restart = 1, + .restart = restart, }; int ret = 0; @@ -4845,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event, spin_unlock_irqrestore(&rb->event_lock, flags); } + /* + * Avoid racing with perf_mmap_close(AUX): stop the event + * before swizzling the event::rb pointer; if it's getting + * unmapped, its aux_mmap_count will be 0 and it won't + * restart. See the comment in __perf_pmu_output_stop(). + * + * Data will inevitably be lost when set_output is done in + * mid-air, but then again, whoever does it like this is + * not in for the data anyway. 
+ */ + if (has_aux(event)) + perf_event_stop(event, 0); + rcu_assign_pointer(event->rb, rb); if (old_rb) { @@ -6120,7 +6133,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } void perf_event_exec(void) @@ -6164,7 +6177,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) /* * In case of inheritance, it will be the parent that links to the - * ring-buffer, but it will be the child that's actually using it: + * ring-buffer, but it will be the child that's actually using it. + * + * We are using event::rb to determine if the event should be stopped, + * however this may race with ring_buffer_attach() (through set_output), + * which will make us skip the event that actually needs to be stopped. + * So ring_buffer_attach() has to stop an aux event before re-assigning + * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); @@ -6678,7 +6697,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -7867,7 +7886,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) mmput(mm); restart: - perf_event_restart(event); + perf_event_stop(event, 1); } /* -- cgit v1.2.3 From b79ccadd6bb10e72cf784a298ca6dc1398eb9a24 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:50 +0300 Subject: perf/core: Fix aux_mmap_count vs aux_refcount order The order of accesses to ring buffer's aux_mmap_count and aux_refcount has to be preserved across the users, namely perf_mmap_close() and perf_aux_output_begin(), otherwise the inversion can result in the latter holding the last reference to the aux buffer and subsequently free'ing it in atomic context, triggering a warning. > ------------[ cut here ]------------ > WARNING: CPU: 0 PID: 257 at kernel/events/ring_buffer.c:541 __rb_free_aux+0x11a/0x130 > CPU: 0 PID: 257 Comm: stopbug Not tainted 4.8.0-rc1+ #2596 > Call Trace: > [] __warn+0xcb/0xf0 > [] warn_slowpath_null+0x1d/0x20 > [] __rb_free_aux+0x11a/0x130 > [] rb_free_aux+0x18/0x20 > [] perf_aux_output_begin+0x163/0x1e0 > [] bts_event_start+0x3a/0xd0 > [] bts_event_add+0x5d/0x80 > [] event_sched_in.isra.104+0xf6/0x2f0 > [] group_sched_in+0x6e/0x190 > [] ctx_sched_in+0x2fe/0x5f0 > [] perf_event_sched_in+0x60/0x80 > [] ctx_resched+0x5b/0x90 > [] __perf_event_enable+0x1e1/0x240 > [] event_function+0xa9/0x180 > [] ? perf_cgroup_attach+0x70/0x70 > [] remote_function+0x3f/0x50 > [] flush_smp_call_function_queue+0x83/0x150 > [] generic_smp_call_function_single_interrupt+0x13/0x60 > [] smp_call_function_single_interrupt+0x27/0x40 > [] call_function_single_interrupt+0x89/0x90 > [] finish_task_switch+0xa6/0x210 > [] ? finish_task_switch+0x67/0x210 > [] __schedule+0x3dd/0xb50 > [] schedule+0x35/0x80 > [] sys_sched_yield+0x61/0x70 > [] entry_SYSCALL_64_fastpath+0x18/0xa8 > ---[ end trace 6235f556f5ea83a9 ]--- This patch puts the checks in perf_aux_output_begin() in the same order as that of perf_mmap_close(). 
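[ Note: a minimal sketch of the perf_mmap_close() side of this ordering; the names follow the changelog, but this is illustrative rather than the verbatim kernel code:

	/* perf_mmap_close(): aux_mmap_count is dropped first ... */
	if (atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
		/* ... the AUX pages are unmapped ... */
		rb_free_aux(rb);	/* ... then aux_refcount is dropped */
		mutex_unlock(&event->mmap_mutex);
	}

Because perf_aux_output_begin() now checks aux_mmap_count before taking an aux_refcount reference, it can no longer end up holding the last reference and freeing the AUX buffer in atomic context. ]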
Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb) return NULL; - if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + if (!rb_has_aux(rb)) goto err; /* - * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), - * the aux buffer is in perf_mmap_close(), about to get freed. + * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), + * about to get freed, so we leave immediately. + * + * Checking rb::aux_mmap_count and rb::refcount has to be done in + * the same order, see perf_mmap_close. Otherwise we end up freeing + * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) - goto err_put; + goto err; + + if (!atomic_inc_not_zero(&rb->aux_refcount)) + goto err; /* * Nesting is not supported for AUX area, make sure nested -- cgit v1.2.3 From de58af878d9146e5decc0cdd7acabaa82881cbe4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Sep 2016 10:29:05 +0200 Subject: Revert "sched/fair: Make update_min_vruntime() more readable" There's a bug in this commit: 97a7142f157a ("sched/fair: Make update_min_vruntime() more readable") ... when !rb_leftmost && curr we fail to advance min_vruntime. So revert it. Reported-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a6820b3771e2..986c10c25176 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -464,17 +464,20 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; + if (cfs_rq->curr) + vruntime = cfs_rq->curr->vruntime; + if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - vruntime = se->vruntime; + if (!cfs_rq->curr) + vruntime = se->vruntime; + else + vruntime = min_vruntime(vruntime, se->vruntime); } - if (cfs_rq->curr) - vruntime = min_vruntime(vruntime, cfs_rq->curr->vruntime); - /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT -- cgit v1.2.3 From f971cc9aabc287120bbe7f3f1abe70c13e61ee94 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 7 Sep 2016 12:45:09 -0400 Subject: tracing: Have max_latency be defined for HWLAT_TRACER as well The hwlat tracer uses tr->max_latency, and if it's the only tracer enabled that uses it, the build will fail. Add max_latency and its file when the hwlat tracer is enabled. 
Link: http://lkml.kernel.org/r/d6c3b7eb-ba95-1ffa-0453-464e1e24262a@infradead.org Reported-by: Randy Dunlap Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- kernel/trace/trace.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 474cc814e16d..e0d0cfc1aa20 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4945,7 +4945,7 @@ out: return ret; } -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -5867,7 +5867,7 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -7195,7 +7195,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) trace_create_file("tracing_max_latency", 0644, d_tracer, &tr->max_latency, &tracing_max_lat_fops); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1d866b0c1567..fd24b1f9ac43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -214,6 +214,8 @@ struct trace_array { */ struct trace_buffer max_buffer; bool allocated_snapshot; +#endif +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; #endif struct trace_pid_list __rcu *filtered_pids; -- cgit v1.2.3 From 5b3f249c94ce1f46bacd9814385b0ee2d1ae52f3 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 19 Aug 2016 12:37:23 +0800 Subject: PM / sleep: Increase default DPM watchdog timeout to 120 We recently received a report that a hard disk could not resume in time due to firmware issues, and the machine hit a kernel panic because of the DPM watchdog timeout. Adjust the default timeout from 60 to 120 seconds to survive on this platform, and make DPM_WATCHDOG depend on EXPERT. Link: https://bugzilla.kernel.org/show_bug.cgi?id=117971 Suggested-by: Pavel Machek Suggested-by: Rafael J. Wysocki Reported-by: Higuita Signed-off-by: Chen Yu Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 68d3ebc12601..e8517b63eb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG config DPM_WATCHDOG bool "Device suspend/resume watchdog" - depends on PM_DEBUG && PSTORE + depends on PM_DEBUG && PSTORE && EXPERT ---help--- Sets up a watchdog timer to capture drivers that are locked up attempting to suspend/resume a device. 
@@ -197,7 +197,7 @@ config DPM_WATCHDOG config DPM_WATCHDOG_TIMEOUT int "Watchdog timeout in seconds" range 1 120 - default 60 + default 120 depends on DPM_WATCHDOG config PM_TRACE -- cgit v1.2.3 From fa7fd6fa38e36d88bc9f2d0e45e5b9bd0387079f Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 19 Aug 2016 14:41:00 +0100 Subject: PM / sleep: enable suspend-to-idle even without registered suspend_ops Suspend-to-idle (aka the "freeze" sleep state) is a system sleep state in which all of the processors enter deepest possible idle state and wait for interrupts right after suspending all the devices. There is no hard requirement for a platform to support and register platform specific suspend_ops to enter suspend-to-idle/freeze state. Only deeper system sleep states like PM_SUSPEND_STANDBY and PM_SUSPEND_MEM rely on such low level support/implementation. suspend-to-idle can be entered as along as all the devices can be suspended. This patch enables the support for suspend-to-idle even on systems that don't have any low level support for deeper system sleep states and/or don't register any platform specific suspend_ops. Signed-off-by: Sudeep Holla Tested-by: Andy Gross Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 1 + kernel/power/suspend.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 5ea50b1b7595..281a697fd458 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -644,6 +644,7 @@ static int __init pm_init(void) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); + pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0acab9d7f96f..1e7f5da648d9 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -118,10 +118,18 @@ static bool valid_state(suspend_state_t state) */ static bool relative_states; +void __init pm_states_init(void) +{ + /* + * freeze state should be supported even without any suspend_ops, + * initialize pm_states accordingly here + */ + pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; +} + static int __init sleep_states_setup(char *str) { relative_states = !strncmp(str, "1", 1); - pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; return 1; } @@ -211,7 +219,7 @@ static int platform_suspend_begin(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) return freeze_ops->begin(); - else if (suspend_ops->begin) + else if (suspend_ops && suspend_ops->begin) return suspend_ops->begin(state); else return 0; @@ -221,7 +229,7 @@ static void platform_resume_end(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) freeze_ops->end(); - else if (suspend_ops->end) + else if (suspend_ops && suspend_ops->end) suspend_ops->end(); } -- cgit v1.2.3 From 1ad1410f632d4141221634308a5e56f339f92009 Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Fri, 9 Sep 2016 10:43:32 +0200 Subject: PM / Hibernate: allow hibernation with PAGE_POISONING_ZERO PAGE_POISONING_ZERO disables zeroing new pages on alloc, they are poisoned (zeroed) as they become available. In the hibernate use case, free pages will appear in the system without being cleared, left there by the loading kernel. This patch will make sure free pages are cleared on resume when PAGE_POISONING_ZERO is enabled. 
We free the pages just after resume because we can't do it later: going through any device resume code might allocate some memory and invalidate the free pages bitmap. Thus we don't need to disable hibernation when PAGE_POISONING_ZERO is enabled. Signed-off-by: Anisse Astier Reviewed-by: Kees Cook Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 21 +++------------------ kernel/power/power.h | 2 ++ kernel/power/snapshot.c | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 33c79b6105c5..b26dbc48c75b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -306,8 +306,10 @@ static int create_image(int platform_mode) if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - if (!in_suspend) + if (!in_suspend) { events_check_enabled = false; + clear_free_pages(); + } platform_leave(platform_mode); @@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init page_poison_nohibernate_setup(char *str) -{ -#ifdef CONFIG_PAGE_POISONING_ZERO - /* - * The zeroing option for page poison skips the checks on alloc. - * since hibernation doesn't save free pages there's no way to - * guarantee the pages will still be zeroed. - */ - if (!strcmp(str, "on")) { - pr_info("Disabling hibernation due to page poisoning\n"); - return nohibernate_setup(str); - } -#endif - return 1; -} - __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/power.h b/kernel/power/power.h index 242d8b827dd5..56d1d0dedf76 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); +extern void clear_free_pages(void); + /** * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b02228411d57..4f0f0604f1c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void) pr_debug("PM: Basic memory bitmaps freed\n"); } +void clear_free_pages(void) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + + if (WARN_ON(!(free_pages_map))) + return; + + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_highpage(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("PM: free pages cleared after restore\n"); +#endif /* PAGE_POISONING_ZERO */ +} + /** * snapshot_additional_pages - Estimate the number of extra pages needed. * @zone: Memory zone to carry out the computation for. 
-- cgit v1.2.3 From e2753293ac4bce8623650bb2d610b7e657bc869f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 12 Sep 2016 13:38:42 -0700 Subject: x86/pkeys: Fix pkeys build breakage for some non-x86 arches Guenter Roeck reported breakage on the h8300 and c6x architectures (among others) caused by the new memory protection keys syscalls. This patch does what Arnd suggested and adds them to kernel/sys_ni.c. Fixes: a60f7b69d92c ("generic syscalls: Wire up memory protection keys syscalls") Reported-and-tested-by: Guenter Roeck Signed-off-by: Dave Hansen Acked-by: Arnd Bergmann Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20160912203842.48E7AC50@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- kernel/sys_ni.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..635482e60ca3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -250,3 +250,8 @@ cond_syscall(sys_execveat); /* membarrier */ cond_syscall(sys_membarrier); + +/* memory protection keys */ +cond_syscall(sys_pkey_mprotect); +cond_syscall(sys_pkey_alloc); +cond_syscall(sys_pkey_free); -- cgit v1.2.3 From 28b89b9e6f7b6c8fef7b3af39828722bca20cfee Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Sun, 11 Sep 2016 21:14:58 -0700 Subject: cpuset: handle race between CPU hotplug and cpuset_hotplug_work A discrepancy between cpu_online_mask and cpuset's effective_cpus mask is inevitable during hotplug since cpuset defers updating of effective_cpus mask using a workqueue, during which time nothing prevents the system from more hotplug operations. For that reason guarantee_online_cpus() walks up the cpuset hierarchy until it finds an intersection under the assumption that top cpuset's effective_cpus mask intersects with cpu_online_mask even with such a race occurring. However a sequence of CPU hotplugs can open a time window, during which none of the effective CPUs in the top cpuset intersect with cpu_online_mask. For example when there are 4 possible CPUs 0-3 and only CPU0 is online: ======================== =========================== cpu_online_mask top_cpuset.effective_cpus ======================== =========================== echo 1 > cpu2/online. CPU hotplug notifier woke up hotplug work but not yet scheduled. [0,2] [0] echo 0 > cpu0/online. The workqueue is still runnable. [2] [0] ======================== =========================== Now there is no intersection between cpu_online_mask and top_cpuset.effective_cpus. Thus invoking sys_sched_setaffinity() at this moment can cause following: Unable to handle kernel NULL pointer dereference at virtual address 000000d0 ------------[ cut here ]------------ Kernel BUG at ffffffc0001389b0 [verbose debug info unavailable] Internal error: Oops - BUG: 96000005 [#1] PREEMPT SMP Modules linked in: CPU: 2 PID: 1420 Comm: taskset Tainted: G W 4.4.8+ #98 task: ffffffc06a5c4880 ti: ffffffc06e124000 task.ti: ffffffc06e124000 PC is at guarantee_online_cpus+0x2c/0x58 LR is at cpuset_cpus_allowed+0x4c/0x6c Process taskset (pid: 1420, stack limit = 0xffffffc06e124020) Call trace: [] guarantee_online_cpus+0x2c/0x58 [] cpuset_cpus_allowed+0x4c/0x6c [] sched_setaffinity+0xc0/0x1ac [] SyS_sched_setaffinity+0x98/0xac [] el0_svc_naked+0x24/0x28 The top cpuset's effective_cpus are guaranteed to be identical to cpu_online_mask eventually. Hence fall back to cpu_online_mask when there is no intersection between top cpuset's effective_cpus and cpu_online_mask. 
Signed-off-by: Joonwoo Park Acked-by: Li Zefan Cc: Tejun Heo Cc: cgroups@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: # 3.17+ Signed-off-by: Tejun Heo --- kernel/cpuset.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c27e53326bef..c08585ad996b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpuset's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. The top - * cpuset always has some cpus online. + * until we find one that does have some online cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. @@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { cs = parent_cs(cs); + if (unlikely(!cs)) { + /* + * The top cpuset doesn't have any online cpu as a + * consequence of a race between cpuset_hotplug_work + * and cpu hotplug notifier. But we know the top + * cpuset's effective_cpus is on its way to be + * identical to cpu_online_mask. + */ + cpumask_copy(pmask, cpu_online_mask); + return; + } + } cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } -- cgit v1.2.3 From 57ccdf449f962ab5fc8cbf26479402f13bdb8be7 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 7 Sep 2016 18:51:13 +0800 Subject: tick/nohz: Prevent stopping the tick on an offline CPU can_stop_full_tick() has no check for offline cpus. So it allows stopping the tick on an offline cpu from the interrupt return path, which is wrong and subsequently makes irq_work_needs_cpu() warn about being called for an offline cpu. Commit f7ea0fd639c2c4 ("tick: Don't invoke tick_nohz_stop_sched_tick() if the cpu is offline") added prevention for can_stop_idle_tick(), but forgot to do the same in can_stop_full_tick(). Add it. 
[ tglx: Massaged changelog ] Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1473245473-4463-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..3bcb61b52f6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep) return false; } -static bool can_stop_full_tick(struct tick_sched *ts) +static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); + if (unlikely(!cpu_online(cpu))) + return false; + if (check_tick_dependency(&tick_dep_mask)) return false; @@ -843,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick(ts)) + if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); -- cgit v1.2.3 From ecb3f394c5dba897d215a5422f1b363e93e2ce4e Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 13 Sep 2016 12:14:51 -0400 Subject: genirq: Expose interrupt information through sysfs Information about interrupts is exposed via /proc/interrupts, but the format of that file has changed over kernel versions and differs across architectures. It also has varying column numbers depending on hardware. That all makes it hard for tools to parse. To solve this, expose the information through sysfs so each irq attribute is in a separate file in a consistent, machine parsable way. This feature is only available when both CONFIG_SPARSE_IRQ and CONFIG_SYSFS are enabled. 
Examples: /sys/kernel/irq/18/actions: i801_smbus,ehci_hcd:usb1,uhci_hcd:usb7 /sys/kernel/irq/18/chip_name: IR-IO-APIC /sys/kernel/irq/18/hwirq: 18 /sys/kernel/irq/18/name: fasteoi /sys/kernel/irq/18/per_cpu_count: 0,0 /sys/kernel/irq/18/type: level /sys/kernel/irq/25/actions: ahci0 /sys/kernel/irq/25/chip_name: IR-PCI-MSI /sys/kernel/irq/25/hwirq: 512000 /sys/kernel/irq/25/name: edge /sys/kernel/irq/25/per_cpu_count: 29036,0 /sys/kernel/irq/25/type: edge [ tglx: Moved kobject_del() under sparse_irq_lock, massaged code comments and changelog ] Signed-off-by: Craig Gallek Cc: David Decotigny Link: http://lkml.kernel.org/r/1473783291-122873-1-git-send-email-kraigatgoog@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..93b51727abaa 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internals.h" @@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ +static void irq_kobj_release(struct kobject *kobj); + +#ifdef CONFIG_SYSFS +static struct kobject *irq_kobj_base; + +#define IRQ_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +static ssize_t per_cpu_count_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + int cpu, irq = desc->irq_data.irq; + ssize_t ret = 0; + char *p = ""; + + for_each_possible_cpu(cpu) { + unsigned int c = kstat_irqs_cpu(irq, cpu); + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + p = ","; + } + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +IRQ_ATTR_RO(per_cpu_count); + +static ssize_t chip_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) { + ret = scnprintf(buf, PAGE_SIZE, "%s\n", + desc->irq_data.chip->name); + } + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(chip_name); + +static ssize_t hwirq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.domain) + ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(hwirq); + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + irqd_is_level_type(&desc->irq_data) ? 
"level" : "edge"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(type); + +static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->name) + ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(name); + +static ssize_t actions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + struct irqaction *action; + ssize_t ret = 0; + char *p = ""; + + raw_spin_lock_irq(&desc->lock); + for (action = desc->action; action != NULL; action = action->next) { + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + p, action->name); + p = ","; + } + raw_spin_unlock_irq(&desc->lock); + + if (ret) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + + return ret; +} +IRQ_ATTR_RO(actions); + +static struct attribute *irq_attrs[] = { + &per_cpu_count_attr.attr, + &chip_name_attr.attr, + &hwirq_attr.attr, + &type_attr.attr, + &name_attr.attr, + &actions_attr.attr, + NULL +}; + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = irq_attrs, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) +{ + if (irq_kobj_base) { + /* + * Continue even in case of failure as this is nothing + * crucial. + */ + if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) + pr_warn("Failed to add kobject for irq %d\n", irq); + } +} + +static int __init irq_sysfs_init(void) +{ + struct irq_desc *desc; + int irq; + + /* Prevent concurrent irq alloc/free */ + irq_lock_sparse(); + + irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); + if (!irq_kobj_base) { + irq_unlock_sparse(); + return -ENOMEM; + } + + /* Add the already allocated interrupts */ + for_each_irq_desc(irq, desc) + irq_sysfs_add(irq, desc); + irq_unlock_sparse(); + + return 0; +} +postcore_initcall(irq_sysfs_init); + +#else /* !CONFIG_SYSFS */ + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) {} + +#endif /* CONFIG_SYSFS */ + static RADIX_TREE(irq_desc_tree, GFP_KERNEL); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); + kobject_init(&desc->kobj, &irq_kobj_type); return desc; @@ -197,15 +374,22 @@ err_desc: return NULL; } -static void delayed_free_desc(struct rcu_head *rhp) +static void irq_kobj_release(struct kobject *kobj) { - struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + kobject_put(&desc->kobj); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. 
+ * + * The sysfs entry must be serialized against a concurrent + * irq_sysfs_init() as well. */ mutex_lock(&sparse_irq_lock); + kobject_del(&desc->kobj); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -261,6 +449,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, goto err; mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); + irq_sysfs_add(start + i, desc); mutex_unlock(&sparse_irq_lock); } return start; -- cgit v1.2.3 From 6846351052e685c2d1428e80ead2d7ca3d7ed913 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 5 Sep 2016 16:33:08 +0300 Subject: x86/signal: Add SA_{X32,IA32}_ABI sa_flags Introduce new flags that define which ABI to use when creating the sigframe. The kernel will set those flags according to the ABI of the sigaction syscall that installed the handler for the signal being delivered; this drops the dependency on the TIF_IA32/TIF_X32 flags at signal delivery. Those flags will be used only under CONFIG_COMPAT. ARM uses sa_flags in a similar way to decide in which mode to deliver signals to 26-bit applications (see SA_THIRTYTWO). Signed-off-by: Dmitry Safonov Reviewed-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: oleg@redhat.com Cc: linux-mm@kvack.org Cc: gorcunov@openvz.org Cc: xemul@virtuozzo.com Link: http://lkml.kernel.org/r/20160905133308.28234-7-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- kernel/signal.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index af21afc00d08..75761acc77cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3044,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action) } EXPORT_SYMBOL(kernel_sigaction); +void __weak sigaction_compat_abi(struct k_sigaction *act, + struct k_sigaction *oact) +{ +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; @@ -3059,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + sigaction_compat_abi(act, oact); + if (act) { sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); -- cgit v1.2.3 From 28f4b04143c56135b1ca742fc64b664ed04de6a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:47 +0200 Subject: genirq/msi: Add cpumask allocation to alloc_msi_entry For irq spreading we want to store affinity masks in the msi_entry. Add the infrastructure for it. We allocate an array of cpumasks with an array size of the number of used vectors in the entry, so we can hand in the information per linux interrupt later. As we hand in the number of used vectors, we assign them right away. Convert all the call sites. 
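[ Note: a hypothetical call-site conversion under the new alloc_msi_entry() signature shown in the diff below; the actual call sites live in the PCI/platform MSI code, which is not part of this series excerpt:

	/* before: a single entry, no affinity information */
	desc = alloc_msi_entry(&dev->dev);

	/* after: nvec vectors plus an optional affinity array of size nvec
	 * (pass NULL when no spreading hint is available) */
	desc = alloc_msi_entry(&dev->dev, nvec, affinity);
]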
Signed-off-by: Thomas Gleixner Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/1473862739-15032-2-git-send-email-hch@lst.de --- kernel/irq/msi.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..8a3e872798f3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -18,20 +18,42 @@ /* Temporary solution for building, will be removed later */ #include -struct msi_desc *alloc_msi_entry(struct device *dev) +/** + * alloc_msi_entry - Allocate and initialize an msi_entry + * @dev: Pointer to the device for which this is allocated + * @nvec: The number of vectors used in this entry + * @affinity: Optional pointer to an affinity mask array of size @nvec + * + * If @affinity is not NULL then an affinity array[@nvec] is allocated + * and the affinity masks from @affinity are copied. + */ +struct msi_desc * +alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) { - struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + struct msi_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; INIT_LIST_HEAD(&desc->list); desc->dev = dev; + desc->nvec_used = nvec; + if (affinity) { + desc->affinity = kmemdup(affinity, + nvec * sizeof(*desc->affinity), GFP_KERNEL); + if (!desc->affinity) { + kfree(desc); + return NULL; + } + } return desc; } void free_msi_entry(struct msi_desc *entry) { + kfree(entry->affinity); kfree(entry); } -- cgit v1.2.3 From 34c3d9819fda464be4f1bec59b63353814f76c73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:48 +0200 Subject: genirq/affinity: Provide smarter irq spreading infrastructure The current irq spreading infrastructure just looks at a cpumask and tries to spread the interrupts over the mask. That's suboptimal as it does not take NUMA nodes into account. Change the logic so the interrupts are spread across NUMA nodes and inside the nodes. If there are more cpus than vectors per node, then we set the affinity to several cpus. If HT siblings are available we take that into account and try to set all siblings to a single vector. 
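[ Note: a worked example of the spreading arithmetic described above, assuming 8 vectors on a 2-node machine with 12 CPUs per node:

	nvec = 8, nodes = 2
	vecs_per_node = 8 / 2 = 4
	extra_vecs    = 8 - (2 * 4) = 0
	per node: ncpus = 12, so cpus_per_vec = 12 / 4 = 3

Each vector is thus affine to three CPUs of one node, and hyperthread siblings are consumed first so that siblings end up sharing a vector, as irq_spread_init_one() in the diff below shows. ]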
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-3-git-send-email-hch@lst.de --- kernel/irq/affinity.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 32f6cfcff212..7812fecc6e2f 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -4,6 +4,155 @@ #include #include +static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + int cpus_per_vec) +{ + const struct cpumask *siblmsk; + int cpu, sibl; + + for ( ; cpus_per_vec > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_vec--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_vec > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_vec--; + } + } +} + +static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes; + + /* Calculate the number of nodes in the supplied affinity mask */ + for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + if (cpumask_intersects(mask, cpumask_of_node(n))) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @nvecs: The number of vectors + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, + int nvec) +{ + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; + nodemask_t nodemsk = NODE_MASK_NONE; + struct cpumask *masks; + cpumask_var_t nmsk; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); + if (!masks) + goto out; + + /* Stabilize the cpumasks */ + get_online_cpus(); + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + nodes = get_nodes_in_cpumask(affinity, &nodemsk); + + /* + * If the number of nodes in the mask is less than or equal the + * number of vectors we just spread the vectors across the nodes. 
+ */ + if (nvec <= nodes) { + for_each_node_mask(n, nodemsk) { + cpumask_copy(masks + curvec, cpumask_of_node(n)); + if (++curvec == nvec) + break; + } + goto outonl; + } + + /* Spread the vectors per node */ + vecs_per_node = nvec / nodes; + /* Account for rounding errors */ + extra_vecs = nvec - (nodes * vecs_per_node); + + for_each_node_mask(n, nodemsk) { + int ncpus, v, vecs_to_assign = vecs_per_node; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, affinity, cpumask_of_node(n)); + + /* Calculate the number of cpus per vector */ + ncpus = cpumask_weight(nmsk); + + for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { + cpus_per_vec = ncpus / vecs_to_assign; + + /* Account for extra vectors to compensate rounding errors */ + if (extra_vecs) { + cpus_per_vec++; + if (!--extra_vecs) + vecs_per_node++; + } + irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); + } + + if (curvec >= nvec) + break; + } + +outonl: + put_online_cpus(); +out: + free_cpumask_var(nmsk); + return masks; +} + +/** + * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @maxvec: The maximum number of vectors available + */ +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +{ + int cpus, ret; + + /* Stabilize the cpumasks */ + get_online_cpus(); + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + cpus = cpumask_weight(affinity); + ret = (cpus < maxvec) ? cpus : maxvec; + + put_online_cpus(); + return ret; +} + static int get_first_sibling(unsigned int cpu) { unsigned int ret; -- cgit v1.2.3 From e75eafb9b0395c338230b0eef2cc92ca8d20dee2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:49 +0200 Subject: genirq/msi: Switch to new irq spreading infrastructure Switch MSI over to the new spreading code. If a pci device contains a valid pointer to a cpumask, then this mask is used for spreading otherwise the online cpu mask is used. This allows a driver to restrict the spread to a subset of CPUs, e.g. cpus on a particular node. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-4-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..5a5a685aba33 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -236,25 +236,24 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct cpumask *mask = NULL; struct irq_desc *desc; unsigned int flags; - int i, cpu = -1; + int i; - if (affinity && cpumask_empty(affinity)) - return -EINVAL; + /* Validate affinity mask(s) */ + if (affinity) { + for (i = 0, mask = affinity; i < cnt; i++, mask++) { + if (cpumask_empty(mask)) + return -EINVAL; + } + } flags = affinity ? IRQD_AFFINITY_MANAGED : 0; + mask = NULL; for (i = 0; i < cnt; i++) { if (affinity) { - cpu = cpumask_next(cpu, affinity); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(affinity); - node = cpu_to_node(cpu); - - /* - * For single allocations we use the caller provided - * mask otherwise we use the mask of the target cpu - */ - mask = cnt == 1 ? 
affinity : cpumask_of(cpu); + node = cpu_to_node(cpumask_first(affinity)); + mask = affinity; + affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) @@ -481,9 +480,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) - * @affinity: Optional pointer to an affinity mask which hints where the - * irq descriptors should be allocated and which default - * affinities to use + * @affinity: Optional pointer to an affinity mask array of size @cnt which + * hints where the irq descriptors should be allocated and which + * default affinities to use * * Returns the first irq number or error code */ -- cgit v1.2.3 From 44082fd6702fb12020967fd375f8bf6dd7c111bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:50 +0200 Subject: genirq/affinity: Remove old irq spread infrastructure No more users. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-5-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 58 --------------------------------------------------- 1 file changed, 58 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 7812fecc6e2f..17f51d63da56 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -152,61 +152,3 @@ int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) put_online_cpus(); return ret; } - -static int get_first_sibling(unsigned int cpu) -{ - unsigned int ret; - - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - return cpu; -} - -/* - * Take a map of online CPUs and the number of available interrupt vectors - * and generate an output cpumask suitable for spreading MSI/MSI-X vectors - * so that they are distributed as good as possible around the CPUs. If - * more vectors than CPUs are available we'll map one to each CPU, - * otherwise we map one to the first sibling of each socket. - * - * If there are more vectors than CPUs we will still only have one bit - * set per CPU, but interrupt code will keep on assigning the vectors from - * the start of the bitmap until we run out of vectors. - */ -struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) -{ - struct cpumask *affinity_mask; - unsigned int max_vecs = *nr_vecs; - - if (max_vecs == 1) - return NULL; - - affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!affinity_mask) { - *nr_vecs = 1; - return NULL; - } - - get_online_cpus(); - if (max_vecs >= num_online_cpus()) { - cpumask_copy(affinity_mask, cpu_online_mask); - *nr_vecs = num_online_cpus(); - } else { - unsigned int vecs = 0, cpu; - - for_each_online_cpu(cpu) { - if (cpu == get_first_sibling(cpu)) { - cpumask_set_cpu(cpu, affinity_mask); - vecs++; - } - - if (--max_vecs == 0) - break; - } - *nr_vecs = vecs; - } - put_online_cpus(); - - return affinity_mask; -} -- cgit v1.2.3 From c65eacbe290b8141554c71b2c94489e73ade8c8d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:24 -0700 Subject: sched/core: Allow putting thread_info into task_struct If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT, then thread_info is defined as a single 'u32 flags' and is the first entry of task_struct. 
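[ Note: an illustrative sketch of the opt-in layout, assuming the simplest possible arch definition; this is not the verbatim upstream structure:

	struct thread_info {
		u32 flags;			/* arch-defined flag bits */
	};

	struct task_struct {
	#ifdef CONFIG_THREAD_INFO_IN_TASK
		struct thread_info thread_info;	/* must stay first */
		unsigned int cpu;		/* replaces thread_info::cpu */
	#endif
		/* ... */
	};
]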
thread_info::task is removed (it serves no purpose if thread_info is embedded in task_struct), and thread_info::cpu gets its own slot in task_struct. This is heavily based on a patch written by Linus. Originally-from: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0898196f0476195ca02713691a5037a14f2aac5.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..3655c9625e5b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1000,7 +1000,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } -- cgit v1.2.3 From 23196f2e5f5d810578a772785807dcdc2b9fdce9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 15 Sep 2016 22:45:44 -0700 Subject: kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function get_task_struct(tsk) no longer pins tsk->stack so all users of to_live_kthread() should do try_get_task_stack/put_task_stack to protect "struct kthread" which lives on kthread's stack. TODO: Kill to_live_kthread(), perhaps we can even kill "struct kthread" too, and rework kthread_stop(), it can use task_work_add() to sync with the exiting kernel thread. Message-Id: <20160629180357.GA7178@redhat.com> Signed-off-by: Oleg Nesterov Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cb9b16bbc19d4aea4507ab0552e4644c1211d130.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/kthread.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173dca1ae..4ab4c3766a80 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) + if (likely(vfork) && try_get_task_stack(k)) return __to_kthread(vfork); return NULL; } @@ -425,8 +425,10 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) + if (kthread) { __kthread_unpark(k, kthread); + put_task_stack(k); + } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -455,6 +457,7 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } + put_task_stack(k); ret = 0; } return ret; @@ -490,6 +493,7 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); + put_task_stack(k); } ret = k->exit_code; put_task_struct(k); -- cgit v1.2.3 From 68f24b08ee892d47bdef925d676e1ae1ccc316f8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:48 -0700 Subject: sched/core: Free the stack early if CONFIG_THREAD_INFO_IN_TASK We currently keep every task's stack around until the task_struct itself is freed. 
This means that we keep the stack allocation alive for longer than necessary and that, under load, we free stacks in big batches whenever RCU drops the last task reference. Neither of these is good for reuse of cache-hot memory, and freeing in batches prevents us from usefully caching small numbers of vmalloced stacks. On architectures that have thread_info on the stack, we can't easily change this, but on architectures that set THREAD_INFO_IN_TASK, we can free it as soon as the task is dead. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/08ca06cde00ebed0046c5d26cbbf3fbb7ef5b812.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 35 ++++++++++++++++++++++++++++++++++- kernel/sched/core.c | 4 ++++ 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0c240fd5beba..5dd0a516626d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -269,11 +269,40 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } -void free_task(struct task_struct *tsk) +static void release_task_stack(struct task_struct *tsk) { account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); + tsk->stack = NULL; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = NULL; +#endif +} + +#ifdef CONFIG_THREAD_INFO_IN_TASK +void put_task_stack(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->stack_refcount)) + release_task_stack(tsk); +} +#endif + +void free_task(struct task_struct *tsk) +{ +#ifndef CONFIG_THREAD_INFO_IN_TASK + /* + * The task is finally done with both the stack and thread_info, + * so free both. + */ + release_task_stack(tsk); +#else + /* + * If the task had a separate stack allocation, it should be gone + * by now. + */ + WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); +#endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -411,6 +440,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_VMAP_STACK tsk->stack_vm_area = stack_vm_area; #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + atomic_set(&tsk->stack_refcount, 1); +#endif if (err) goto free_stack; @@ -1771,6 +1803,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + put_task_stack(p); free_task(p); fork_out: return ERR_PTR(retval); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b6238f18da2..23c6037e2d89 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2772,6 +2772,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) * task and put them back on the free list. */ kprobe_flush_task(prev); + + /* Task is done with its stack. */ + put_task_stack(prev); + put_task_struct(prev); } -- cgit v1.2.3 From ac496bf48d97f2503eaa353996a4dd5e4383eaf0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:49 -0700 Subject: fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmalloc() is a bit slow, and pounding vmalloc()/vfree() will eventually force a global TLB flush. To reduce pressure on them, if CONFIG_VMAP_STACK=y, cache two thread stacks per CPU. 
This will let us quickly allocate a hopefully cache-hot, TLB-hot stack under heavy forking workloads (shell script style). On my silly pthread_create() benchmark, it saves about 2 µs per pthread_create()+join() with CONFIG_VMAP_STACK=y. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/94811d8e3994b2e962f88866290017d498eb069c.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5dd0a516626d..c060c7e7c247 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -159,15 +159,41 @@ void __weak arch_release_thread_stack(unsigned long *stack) * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) + +#ifdef CONFIG_VMAP_STACK +/* + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB + * flush. Try to minimize the number of calls by caching stacks. + */ +#define NR_CACHED_STACKS 2 +static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +#endif + static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { #ifdef CONFIG_VMAP_STACK - void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, - PAGE_KERNEL, - 0, node, - __builtin_return_address(0)); + void *stack; + int i; + + local_irq_disable(); + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *s = this_cpu_read(cached_stacks[i]); + + if (!s) + continue; + this_cpu_write(cached_stacks[i], NULL); + + tsk->stack_vm_area = s; + local_irq_enable(); + return s->addr; + } + local_irq_enable(); + + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, __builtin_return_address(0)); /* * We can't call find_vm_area() in interrupt context, and @@ -187,10 +213,28 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { - if (task_stack_vm_area(tsk)) +#ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; + int i; + + local_irq_save(flags); + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_read(cached_stacks[i])) + continue; + + this_cpu_write(cached_stacks[i], tsk->stack_vm_area); + local_irq_restore(flags); + return; + } + local_irq_restore(flags); + vfree(tsk->stack); - else - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + return; + } +#endif + + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; -- cgit v1.2.3 From 8a15b81741879fa89601b03c0e50b0d780d65bc0 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 16 Sep 2016 13:02:37 +0000 Subject: cpuset: fix non static symbol warning Fixes the following sparse warning: kernel/cpuset.c:2088:6: warning: symbol 'cpuset_fork' was not declared. Should it be static? 
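The warning is purely about linkage: sparse flags any symbol defined with external linkage that was never declared beforehand. A minimal, hypothetical reproduction:

#include <stdio.h>

/* sparse: symbol 'leaky_helper' was not declared. Should it be static? */
void leaky_helper(void)
{
	puts("only used in this file");
}

/* internal linkage: no warning, and the single-file use is documented */
static void quiet_helper(void)
{
	puts("only used in this file");
}

int main(void)
{
	leaky_helper();
	quiet_helper();
	return 0;
}

Since cpuset_fork() is only reached through the cgroup_subsys operations table in the same file, internal linkage is the correct fix.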
Signed-off-by: Wei Yongjun Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c08585ad996b..2b4c20ab5bbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2085,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ -void cpuset_fork(struct task_struct *task) +static void cpuset_fork(struct task_struct *task) { if (task_css_is_root(task, cpuset_cgrp_id)) return; -- cgit v1.2.3 From 1984e075915cbae65336a99b1879865080d8e55e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Sep 2016 09:49:27 +0100 Subject: genirq: Skip chained interrupt trigger setup if type is IRQ_TYPE_NONE There is no point in trying to configure the trigger of a chained interrupt if no trigger information has been configured. At best this is ignored, and at the worse this confuses the underlying irqchip (which is likely not to handle such a thing), and unnecessarily alarms the user. Only apply the configuration if type is not IRQ_TYPE_NONE. Fixes: 1e12c4a9393b ("genirq: Correctly configure the trigger on chained interrupts") Reported-and-tested-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/CAMuHMdVW1eTn20=EtYcJ8hkVwohaSuH_yQXrY2MGBEvZ8fpFOg@mail.gmail.com Link: http://lkml.kernel.org/r/1474274967-15984-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 637389088b3f..26ba5654d9d5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -820,6 +820,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + unsigned int type = irqd_get_trigger_type(&desc->irq_data); + /* * We're about to start this interrupt immediately, * hence the need to set the trigger configuration. @@ -828,8 +830,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, * chained interrupt. Reset it immediately because we * do know better. */ - __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); - desc->handle_irq = handle; + if (type != IRQ_TYPE_NONE) { + __irq_set_trigger(desc, type); + desc->handle_irq = handle; + } irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); -- cgit v1.2.3 From 30e92153b4e6f1cd01e30c34d9ef6f0986f96b0e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 6 Sep 2016 19:04:49 +0200 Subject: padata: Convert to hotplug state machine Install the callbacks via the state machine. CPU-hotplug multinstance support is used with the nocalls() version. Maybe parts of padata_alloc() could be moved into the online callback so that we could invoke ->startup callback for instance and drop get_online_cpus(). 
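Condensed, the multi-instance pattern adopted here looks roughly like this sketch (hypothetical foo_* names; error handling and the actual per-CPU work trimmed):

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/list.h>

struct foo_instance {
	struct hlist_node node;
};

static enum cpuhp_state hp_online;

static int foo_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	/* recover the per-driver instance from its hlist node */
	struct foo_instance *inst =
		hlist_entry_safe(node, struct foo_instance, node);

	if (!inst)
		return 0;
	/* bring the instance up on @cpu here */
	return 0;
}

static int foo_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
{
	struct foo_instance *inst =
		hlist_entry_safe(node, struct foo_instance, node);

	if (!inst)
		return 0;
	/* quiesce the instance on @cpu here */
	return 0;
}

static int __init foo_init(void)
{
	int ret;

	/* one dynamically allocated state shared by all instances */
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "foo:online",
				      foo_cpu_online, foo_cpu_prep_down);
	if (ret < 0)
		return ret;
	hp_online = ret;
	return 0;
}

Each instance then attaches itself with cpuhp_state_add_instance_nocalls(hp_online, &inst->node) and detaches with cpuhp_state_remove_instance_nocalls() on teardown, exactly as padata_alloc() and __padata_free() do in the diff below.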
Signed-off-by: Sebastian Andrzej Siewior Cc: Steffen Klassert Cc: Peter Zijlstra Cc: linux-crypto@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160906170457.32393-14-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/padata.c | 88 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 993278895ccc..7848f0566403 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -30,6 +30,7 @@ #include #include #include +#include #define MAX_OBJ_NUM 1000 @@ -769,52 +770,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); } - -static int padata_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) { - int err; struct padata_instance *pinst; - int cpu = (unsigned long)hcpu; + int ret; - pinst = container_of(nfb, struct padata_instance, cpu_notifier); + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; + mutex_lock(&pinst->lock); + ret = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; +} - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; - } +static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) +{ + struct padata_instance *pinst; + int ret; + + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; - return NOTIFY_OK; + mutex_lock(&pinst->lock); + ret = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; } + +static enum cpuhp_state hp_online; #endif static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif padata_stop(pinst); @@ -1012,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - pinst->cpu_notifier.notifier_call = padata_cpu_callback; - pinst->cpu_notifier.priority = 0; - register_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); #endif - return pinst; err_free_masks: @@ -1039,3 +1028,26 @@ void padata_free(struct padata_instance *pinst) kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); + +#ifdef CONFIG_HOTPLUG_CPU + +static __init int padata_driver_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", + padata_cpu_online, + padata_cpu_prep_down); + if (ret < 0) + return ret; + hp_online = ret; + return 0; +} +module_init(padata_driver_init); + +static __exit void padata_driver_exit(void) +{ + cpuhp_remove_multi_state(hp_online); +} +module_exit(padata_driver_exit); +#endif -- cgit v1.2.3 From 
d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 19 Sep 2016 14:44:38 -0700 Subject: cgroup: duplicate cgroup reference when cloning sockets When a socket is cloned, the associated sock_cgroup_data is duplicated but not its reference on the cgroup. As a result, the cgroup reference count will underflow when both sockets are destroyed later on. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..5e8dab5bf9ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { -- cgit v1.2.3 From b399cf64e318ac8c5f10d36bb911e61c746b8788 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:12 +0200 Subject: bpf, verifier: enforce larger zero range for pkt on overloading stack buffs Current contract for the following two helper argument types is: * ARG_CONST_STACK_SIZE: passed argument pair must be (ptr, >0). * ARG_CONST_STACK_SIZE_OR_ZERO: passed argument pair can be either (NULL, 0) or (ptr, >0). With 6841de8b0d03 ("bpf: allow helpers access the packet directly"), we can pass also raw packet data to helpers, so depending on the argument type being PTR_TO_PACKET, we now either assert memory via check_packet_access() or check_stack_boundary(). As a result, the tests in check_packet_access() currently allow more than intended with regards to reg->imm. Back in 969bf05eb3ce ("bpf: direct packet access"), check_packet_access() was fine to ignore size argument since in check_mem_access() size was bpf_size_to_bytes() derived and prior to the call to check_packet_access() guaranteed to be larger than zero. However, for the above two argument types, it currently means, we can have a <= 0 size and thus breaking current guarantees for helpers. Enforce a check for size <= 0 and bail out if so. check_stack_boundary() doesn't have such an issue since it already tests for access_size <= 0 and bails out, resp. access_size == 0 in case of NULL pointer passed when allowed. Fixes: 6841de8b0d03 ("bpf: allow helpers access the packet directly") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90493a66dddd..bc138f34e38c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -671,7 +671,7 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, struct reg_state *reg = ®s[regno]; off += reg->off; - if (off < 0 || off + size > reg->range) { + if (off < 0 || size <= 0 || off + size > reg->range) { verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; -- cgit v1.2.3 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely and has read access for the most parts with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. Advantage of direct write is that the writes are inlined whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() also for small sizes, as well as cost of helper call itself with sanity checks are avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. 
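From the program side, the resulting pattern looks roughly like this hypothetical tc classifier fragment in restricted C (helper declared samples-style; everything apart from the kernel-provided names is invented for illustration):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>

static int (*bpf_skb_pull_data)(void *ctx, int len) =
	(void *) BPF_FUNC_skb_pull_data;

int handle(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;

	if (data + 14 > data_end) {	/* 14: Ethernet header length */
		/* The bytes may sit in non-linear parts: pull them into the
		 * linear head (which also unclones the skb), then re-derive
		 * the pointers and test again. */
		if (bpf_skb_pull_data(skb, 14))
			return TC_ACT_OK;
		data = (void *)(long)skb->data;
		data_end = (void *)(long)skb->data_end;
		if (data + 14 > data_end)
			return TC_ACT_OK;
	}

	*(unsigned char *)data = 0;	/* direct packet write, bounds-checked above */
	return TC_ACT_OK;
}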
Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 3 +++ kernel/bpf/verifier.c | 54 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 43 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5b8bf8cfcfd..39918402e6e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc138f34e38c..3a75ee3bdcd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -196,6 +196,7 @@ struct verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool seen_direct_write; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +205,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct verifier_env *env, + const struct bpf_call_arg_meta *meta) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; @@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 
regno, return 0; } - if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) { - verbose("helper access to the packet is not allowed for clsact\n"); + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id) changes_data = bpf_helper_changes_skb_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg being in raw mode at the moment, which * is sufficient for the helper functions we have right now. @@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) */ static int convert_ctx_accesses(struct verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, insn_cnt, cnt; - if (!env->prog->aux->ops->convert_ctx_access) + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + } + } + + if (!ops->convert_ctx_access) return 0; + insn_cnt = env->prog->len; + insn = env->prog->insnsi; + for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + u32 insn_delta; if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) @@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env) continue; } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; -- cgit v1.2.3 From 3df126f35f88dc76eea33769f85a3c3bb8ce6c6b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:56 +0100 Subject: bpf: don't (ab)use instructions to store state Storing state in reserved fields of instructions makes it impossible to run verifier on programs already marked as read-only. Allocate and use an array of per-instruction state instead. While touching the error path rename and move existing jump target. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 70 +++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3a75ee3bdcd1..a9542d89f293 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -181,6 +181,10 @@ struct verifier_stack_elem { struct verifier_stack_elem *next; }; +struct bpf_insn_aux_data { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ +}; + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ /* single container for all structs @@ -197,6 +201,7 @@ struct verifier_env { u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; bool seen_direct_write; + struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -2344,7 +2349,7 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; + enum bpf_reg_type *prev_src_type, src_reg_type; /* check for reserved fields is already done */ @@ -2374,16 +2379,18 @@ static int do_check(struct verifier_env *env) continue; } - if (insn->imm == 0) { + prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_src_type == NOT_INIT) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) - * use reserved 'imm' field to mark this insn + * save type to validate intersecting paths */ - insn->imm = src_reg_type; + *prev_src_type = src_reg_type; - } else if (src_reg_type != insn->imm && + } else if (src_reg_type != *prev_src_type && (src_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_src_type == PTR_TO_CTX)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -2396,7 +2403,7 @@ static int do_check(struct verifier_env *env) } } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; + enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -2424,11 +2431,13 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (insn->imm == 0) { - insn->imm = dst_reg_type; - } else if (dst_reg_type != insn->imm && + prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_dst_type == NOT_INIT) { + *prev_dst_type = dst_reg_type; + } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_dst_type == PTR_TO_CTX)) { verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -2686,10 +2695,11 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) static int convert_ctx_accesses(struct verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, insn_cnt, cnt; + int i, cnt, delta = 0; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -2703,18 +2713,16 @@ static int convert_ctx_accesses(struct verifier_env *env) if (!new_prog) return -ENOMEM; env->prog = new_prog; + delta += cnt - 1; } } if (!ops->convert_ctx_access) return 0; - insn_cnt = env->prog->len; - insn = env->prog->insnsi; + insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta; - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = 
BPF_READ; @@ -2724,11 +2732,8 @@ static int convert_ctx_accesses(struct verifier_env *env) else continue; - if (insn->imm != PTR_TO_CTX) { - /* clear internal mark */ - insn->imm = 0; + if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) continue; - } cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, insn->off, insn_buf, env->prog); @@ -2737,18 +2742,16 @@ static int convert_ctx_accesses(struct verifier_env *env) return -EINVAL; } - new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); + new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, + cnt); if (!new_prog) return -ENOMEM; - insn_delta = cnt - 1; + delta += cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + insn_delta; - - insn_cnt += insn_delta; - i += insn_delta; + insn = new_prog->insnsi + i + delta; } return 0; @@ -2792,6 +2795,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env) return -ENOMEM; + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + (*prog)->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ @@ -2810,12 +2818,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* log_* values have to be sane */ if (log_size < 128 || log_size > UINT_MAX >> 8 || log_level == 0 || log_ubuf == NULL) - goto free_env; + goto err_unlock; ret = -ENOMEM; log_buf = vmalloc(log_size); if (!log_buf) - goto free_env; + goto err_unlock; } else { log_level = 0; } @@ -2884,14 +2892,16 @@ skip_full_check: free_log_buf: if (log_level) vfree(log_buf); -free_env: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); *prog = env->prog; - kfree(env); +err_unlock: mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: + kfree(env); return ret; } -- cgit v1.2.3 From 58e2af8b3a6b587e4ac8414343581da4349d3c0f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:57 +0100 Subject: bpf: expose internal verfier structures Move verifier's internal structures to a header file and prefix their names with bpf_ to avoid potential namespace conflicts. Those structures will soon be used by external analyzers. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 266 +++++++++++++++++++------------------------------- 1 file changed, 103 insertions(+), 163 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9542d89f293..dca2b9b1d02e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -126,82 +127,16 @@ * are set to NOT_INIT to indicate that they are no longer readable. 
*/ -struct reg_state { - enum bpf_reg_type type; - union { - /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ - s64 imm; - - /* valid when type == PTR_TO_PACKET* */ - struct { - u32 id; - u16 off; - u16 range; - }; - - /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | - * PTR_TO_MAP_VALUE_OR_NULL - */ - struct bpf_map *map_ptr; - }; -}; - -enum bpf_stack_slot_type { - STACK_INVALID, /* nothing was stored in this stack slot */ - STACK_SPILL, /* register spilled into stack */ - STACK_MISC /* BPF program wrote some data into this slot */ -}; - -#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ - -/* state of the program: - * type of all registers and stack info - */ -struct verifier_state { - struct reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; -}; - -/* linked list of verifier states used to prune search */ -struct verifier_state_list { - struct verifier_state state; - struct verifier_state_list *next; -}; - /* verifier_state + insn_idx are pushed to stack when branch is encountered */ -struct verifier_stack_elem { +struct bpf_verifier_stack_elem { /* verifer state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ - struct verifier_state st; + struct bpf_verifier_state st; int insn_idx; int prev_insn_idx; - struct verifier_stack_elem *next; -}; - -struct bpf_insn_aux_data { - enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ -}; - -#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ - -/* single container for all structs - * one verifier_env per bpf_check() call - */ -struct verifier_env { - struct bpf_prog *prog; /* eBPF program being verified */ - struct verifier_stack_elem *head; /* stack of verifier states to be processed */ - int stack_size; /* number of states to be processed */ - struct verifier_state cur_state; /* current verifier state */ - struct verifier_state_list **explored_states; /* search pruning optimization */ - struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ - u32 used_map_cnt; /* number of used maps */ - u32 id_gen; /* used to generate unique reg IDs */ - bool allow_ptr_leaks; - bool seen_direct_write; - struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + struct bpf_verifier_stack_elem *next; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -254,9 +189,9 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -static void print_verifier_state(struct verifier_state *state) +static void print_verifier_state(struct bpf_verifier_state *state) { - struct reg_state *reg; + struct bpf_reg_state *reg; enum bpf_reg_type t; int i; @@ -432,9 +367,9 @@ static void print_bpf_insn(struct bpf_insn *insn) } } -static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; int insn_idx; if (env->head == NULL) @@ -451,12 +386,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) return insn_idx; } -static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, - int prev_insn_idx) +static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; - elem = kmalloc(sizeof(struct 
verifier_stack_elem), GFP_KERNEL); + elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; @@ -482,7 +417,7 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void init_reg_state(struct reg_state *regs) +static void init_reg_state(struct bpf_reg_state *regs) { int i; @@ -498,7 +433,7 @@ static void init_reg_state(struct reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; @@ -511,7 +446,7 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int check_reg_arg(struct reg_state *regs, u32 regno, +static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, enum reg_arg_type t) { if (regno >= MAX_BPF_REG) { @@ -571,8 +506,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct verifier_state *state, int off, int size, - int value_regno) +static int check_stack_write(struct bpf_verifier_state *state, int off, + int size, int value_regno) { int i; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -597,7 +532,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, } else { /* regular write of data into stack */ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct reg_state) {}; + (struct bpf_reg_state) {}; for (i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -605,7 +540,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, return 0; } -static int check_stack_read(struct verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -646,7 +581,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, } /* check read/write into map element returned by bpf_map_lookup_elem() */ -static int check_map_access(struct verifier_env *env, u32 regno, int off, +static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; @@ -661,7 +596,7 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_access_direct_pkt_data(struct verifier_env *env, +static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta) { switch (env->prog->type) { @@ -678,11 +613,11 @@ static bool may_access_direct_pkt_data(struct verifier_env *env, } } -static int check_packet_access(struct verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *reg = ®s[regno]; off += reg->off; if (off < 0 || size <= 0 || off + size > reg->range) { @@ -694,7 +629,7 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct 
verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { if (env->prog->aux->ops->is_valid_access && @@ -709,7 +644,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } -static bool is_pointer_value(struct verifier_env *env, int regno) +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { if (env->allow_ptr_leaks) return false; @@ -723,12 +658,13 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } -static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, - int off, int size) +static int check_ptr_alignment(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int off, int size) { if (reg->type != PTR_TO_PACKET) { if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); + verbose("misaligned access off %d size %d\n", + off, size); return -EACCES; } else { return 0; @@ -769,12 +705,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { - struct verifier_state *state = &env->cur_state; - struct reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; int size, err = 0; if (reg->type == PTR_TO_STACK) @@ -860,9 +796,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || @@ -896,12 +832,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, int regno, +static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs; int off, i; if (regs[regno].type != PTR_TO_STACK) { @@ -940,11 +876,11 @@ static int check_stack_boundary(struct verifier_env *env, int regno, return 0; } -static int check_func_arg(struct verifier_env *env, u32 regno, +static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1149,10 +1085,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? 
-EINVAL : 0; } -static void clear_all_pkt_pointers(struct verifier_env *env) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs, *reg; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) @@ -1172,12 +1108,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env) } } -static int check_call(struct verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct reg_state *regs = state->regs; - struct reg_state *reg; + struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1280,12 +1216,13 @@ static int check_call(struct verifier_env *env, int func_id) return 0; } -static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +static int check_packet_ptr_add(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; - struct reg_state tmp_reg; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1353,10 +1290,10 @@ add_imm: return 0; } -static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; u8 opcode = BPF_OP(insn->code); s64 imm_log2; @@ -1366,7 +1303,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) */ if (BPF_SRC(insn->code) == BPF_X) { - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && dst_reg->imm && opcode == BPF_ADD) { @@ -1455,11 +1392,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. 
@@ -1476,9 +1414,9 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) } /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) +static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1652,10 +1590,10 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct verifier_state *state, - const struct reg_state *dst_reg) +static void find_good_pkt_pointers(struct bpf_verifier_state *state, + struct bpf_reg_state *dst_reg) { - struct reg_state *regs = state->regs, *reg; + struct bpf_reg_state *regs = state->regs, *reg; int i; /* LLVM can generate two kind of checks: @@ -1701,11 +1639,11 @@ static void find_good_pkt_pointers(struct verifier_state *state, } } -static int check_cond_jmp_op(struct verifier_env *env, +static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct verifier_state *other_branch, *this_branch = &env->cur_state; - struct reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1767,7 +1705,7 @@ static int check_cond_jmp_op(struct verifier_env *env, if (!other_branch) return -EFAULT; - /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { @@ -1809,9 +1747,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) } /* verify BPF_LD_IMM64 instruction */ -static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -1866,11 +1804,11 @@ static bool may_access_skb(enum bpf_prog_type type) * Output: * R0 - 8/16/32-bit skb data converted to cpu endianness */ -static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct reg_state *reg; + struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -1956,7 +1894,7 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) +#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static int *insn_stack; /* stack of insns to process */ static int cur_stack; /* current stack index */ @@ -1967,7 +1905,7 @@ static int *insn_state; * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -2008,7 +1946,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) /* non-recursive 
depth-first-search to detect loops in BPF program * loop == back-edge in directed graph */ -static int check_cfg(struct verifier_env *env) +static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2117,7 +2055,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +static bool compare_ptrs_to_packet(struct bpf_reg_state *old, + struct bpf_reg_state *cur) { if (old->id != cur->id) return false; @@ -2192,9 +2131,10 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +static bool states_equal(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) { - struct reg_state *rold, *rcur; + struct bpf_reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -2234,9 +2174,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) * the same, check that stored pointers types * are the same as well. * Ex: explored safe path could have stored - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} * but current path has stored: - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ @@ -2247,10 +2187,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) return true; } -static int is_state_visited(struct verifier_env *env, int insn_idx) +static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { - struct verifier_state_list *new_sl; - struct verifier_state_list *sl; + struct bpf_verifier_state_list *new_sl; + struct bpf_verifier_state_list *sl; sl = env->explored_states[insn_idx]; if (!sl) @@ -2274,7 +2214,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) * it will be rejected. 
Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); if (!new_sl) return -ENOMEM; @@ -2285,11 +2225,11 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) return 0; } -static int do_check(struct verifier_env *env) +static int do_check(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; @@ -2572,7 +2512,7 @@ static int check_map_prog_compatibility(struct bpf_map *map, /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ -static int replace_map_fd_with_map_ptr(struct verifier_env *env) +static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2669,7 +2609,7 @@ next_insn: } /* drop refcnt of maps used by the rejected program */ -static void release_maps(struct verifier_env *env) +static void release_maps(struct bpf_verifier_env *env) { int i; @@ -2678,7 +2618,7 @@ static void release_maps(struct verifier_env *env) } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -static void convert_pseudo_ld_imm64(struct verifier_env *env) +static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2692,7 +2632,7 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ -static int convert_ctx_accesses(struct verifier_env *env) +static int convert_ctx_accesses(struct bpf_verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; const int insn_cnt = env->prog->len; @@ -2757,9 +2697,9 @@ static int convert_ctx_accesses(struct verifier_env *env) return 0; } -static void free_states(struct verifier_env *env) +static void free_states(struct bpf_verifier_env *env) { - struct verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl, *sln; int i; if (!env->explored_states) @@ -2782,16 +2722,16 @@ static void free_states(struct verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; - struct verifier_env *env; + struct bpf_verifier_env *env; int ret = -EINVAL; if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; - /* 'struct verifier_env' can be global, but since it's not small, + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -2833,7 +2773,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) goto skip_full_check; env->explored_states = kcalloc(env->prog->len, - sizeof(struct verifier_state_list *), + sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) -- cgit v1.2.3 From 13a27dfc669724564aafa2699976ee756029fed2 Mon Sep 17 00:00:00 2001 From: Jakub 
Kicinski Date: Wed, 21 Sep 2016 11:43:58 +0100 Subject: bpf: enable non-core use of the verfier Advanced JIT compilers and translators may want to use eBPF verifier as a base for parsers or to perform custom checks and validations. Add ability for external users to invoke the verifier and provide callbacks to be invoked for every intruction checked. For now only add most basic callback for per-instruction pre-interpretation checks is added. More advanced users may also like to have per-instruction post callback and state comparison callback. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dca2b9b1d02e..ee86a77dc40b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -632,6 +632,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + /* for analyzer ctx accesses are already validated and converted */ + if (env->analyzer_ops) + return 0; + if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { /* remember the offset of last byte accessed in ctx */ @@ -2225,6 +2229,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } +static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) + return 0; + + return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); +} + static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state = &env->cur_state; @@ -2283,6 +2296,10 @@ static int do_check(struct bpf_verifier_env *env) print_bpf_insn(insn); } + err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); + if (err) + return err; + if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -2845,3 +2862,54 @@ err_free_env: kfree(env); return ret; } + +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv) +{ + struct bpf_verifier_env *env; + int ret; + + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + prog->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; + env->prog = prog; + env->analyzer_ops = ops; + env->analyzer_priv = priv; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + log_level = 0; + + env->explored_states = kcalloc(env->prog->len, + sizeof(struct bpf_verifier_state_list *), + GFP_KERNEL); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + + mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: + kfree(env); + return ret; +} +EXPORT_SYMBOL_GPL(bpf_analyzer); -- cgit v1.2.3 From 6b17387307bafc71624b9890b9239b6a438e2e89 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:59 +0100 Subject: bpf: recognize 64bit immediate loads as 
consts When running as parser interpret BPF_LD | BPF_IMM | BPF_DW instructions as loading CONST_IMM with the value stored in imm. The verifier will continue not recognizing those due to concerns about search space/program complexity increase. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ee86a77dc40b..8c3f794c7028 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1769,9 +1769,19 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; - if (insn->src_reg == 0) - /* generic move 64-bit immediate into a register */ + if (insn->src_reg == 0) { + /* generic move 64-bit immediate into a register, + * only analyzer needs to collect the ld_imm value. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + + if (!env->analyzer_ops) + return 0; + + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = imm; return 0; + } /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); -- cgit v1.2.3 From 4fa5cd5245b627db88c9ca08ae442373b02596b4 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 13 Sep 2016 16:27:05 +1000 Subject: sched/core: Do not use smp_processor_id() with preempt enabled in smpboot_thread_fn() We should not be using smp_processor_id() with preempt enabled. Bug identified and fix provided by Alfred Chen. Reported-by: Alfred Chen Signed-off-by: Con Kolivas Cc: Alfred Chen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/2042051.3vvUWIM0vs@hex Signed-off-by: Ingo Molnar --- kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..fc0d8270f69e 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); - preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } + preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue; -- cgit v1.2.3 From 8db549491c4a3ce9e1d509b75f78516e497f48ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 11 Sep 2016 10:36:26 +0200 Subject: smp: Allocate smp_call_on_cpu() workqueue on stack too The SMP IPI struct descriptor is allocated on the stack except for the workqueue and lockdep complains: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 0 PID: 110 Comm: kworker/0:1 Not tainted 4.8.0-rc5+ #14 Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A13 05/11/2014 Workqueue: events smp_call_on_cpu_callback ... Call Trace: dump_stack register_lock_class ? __lock_acquire __lock_acquire ? __lock_acquire lock_acquire ? process_one_work process_one_work ? process_one_work worker_thread ? process_one_work ? process_one_work kthread ? kthread_create_on_node ret_from_fork So allocate it on the stack too. Signed-off-by: Peter Zijlstra (Intel) [ Test and write commit message. 
] Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160911084323.jhtnpb4b37t5tlno@pd.tnic Signed-off-by: Ingo Molnar --- kernel/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index f4f6137941cb..bba3b201668d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -759,13 +759,14 @@ static void smp_call_on_cpu_callback(struct work_struct *work) int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { - .work = __WORK_INITIALIZER(sscs.work, smp_call_on_cpu_callback), .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; + INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; -- cgit v1.2.3 From 0b8473570ce1af3e80da05b59f9321f30253de4d Mon Sep 17 00:00:00 2001 From: Cheng Chao Date: Wed, 14 Sep 2016 10:18:56 +0800 Subject: sched/core: Remove unnecessary initialization in sched_init() init_idle() is called immediately after: current->sched_class = &fair_sched_class; init_idle() sets: current->sched_class = &idle_sched_class; The first assignment is superfluous. Signed-off-by: Cheng Chao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1473819536-7398-1-git-send-email-cs.os.kernel@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 860070fba814..c5f020c601b2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7557,11 +7557,6 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, -- cgit v1.2.3 From bf89a304722f6904009499a31dc68ab9a5c9742e Mon Sep 17 00:00:00 2001 From: Cheng Chao Date: Wed, 14 Sep 2016 10:01:50 +0800 Subject: stop_machine: Avoid a sleep and wakeup in stop_one_cpu() In case @cpu == smp_processor_id(), we can avoid a sleep+wakeup cycle by doing a preemption. Callers such as sched_exec() can benefit from this change. Signed-off-by: Cheng Chao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: chris@chris-wilson.co.uk Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1473818510-6779-1-git-send-email-cs.os.kernel@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++++-- kernel/stop_machine.c | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c5f020c601b2..ff4e3c066dc2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1063,8 +1063,12 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock.
*/ - if (task_rq(p) == rq && task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + if (task_rq(p) == rq) { + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + else + p->wake_cpu = arg->dest_cpu; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..082e71f17a58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -126,6 +126,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; + /* + * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup + * cycle by doing a preemption: + */ + cond_resched(); wait_for_completion(&done.completion); return done.ret; } -- cgit v1.2.3 From 9af6528ee9b682df7f29dbee86fbba0b67eab944 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Sep 2016 18:37:29 +0200 Subject: sched/core: Optimize __schedule() Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD context switch, we can avoid the TASK_DEAD special case currently in __schedule() because that avoids the extra preempt_disable() from schedule(). In order to facilitate this, create a do_task_dead() helper which we place in the scheduler code, such that it can access __schedule(). Also add some __noreturn annotations to the functions, there's no coming back from do_exit(). Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Cheng Chao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: chris@chris-wilson.co.uk Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20160913163729.GB5012@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/exit.c | 26 ++------------------------ kernel/sched/core.c | 38 +++++++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..1e1d913914c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -725,7 +725,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -void do_exit(long code) +void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -882,29 +882,7 @@ void do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". 
*/ - for (;;) - cpu_relax(); /* For when BUG is null */ + do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ff4e3c066dc2..b2ec53c1a974 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt) } STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ +void __noreturn do_task_dead(void) +{ + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&current->pi_lock); + + /* causes final put_task_struct in finish_task_switch(). */ + __set_current_state(TASK_DEAD); + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + __schedule(false); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) + cpu_relax(); /* For when BUG is null */ +} + static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) -- cgit v1.2.3 From 35a773a07926a22bf19d77ee00024522279c4e68 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Sep 2016 12:57:53 +0200 Subject: sched/core: Avoid _cond_resched() for PREEMPT=y On fully preemptible kernels _cond_resched() is pointless, so avoid emitting any code for it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mikulas Patocka Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b2ec53c1a974..d7babcc7cb76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4883,6 +4883,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +#ifndef CONFIG_PREEMPT int __sched _cond_resched(void) { if (should_resched(0)) { @@ -4892,6 +4893,7 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); +#endif /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -- cgit v1.2.3 From 3bf6215a1b30db7df6083c708caab3fe1a8e8abe Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 20 Sep 2016 18:48:11 +0300 Subject: perf/core: Limit matching exclusive events to one PMU An "exclusive" PMU is one that can only have one event scheduled in at any given time. There may be more than one such PMU in a system, though, like Intel PT and BTS. It should be allowed to have one event for either of those inside the same context (there may be other constraints that may prevent this, but those would be hardware-specific).
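[ Editorial aside, not part of the patch: for context, a minimal sketch of how a driver marks its PMU as exclusive. The struct pmu field and the PERF_PMU_CAP_EXCLUSIVE flag are real perf-core interfaces; the driver itself is hypothetical.

	#include <linux/perf_event.h>

	/* A PMU that can schedule only one event at a time advertises the
	 * fact via its capabilities; the core then runs the
	 * exclusive_event_match() filter (changed below) against it. */
	static struct pmu example_trace_pmu = {
		.capabilities	= PERF_PMU_CAP_EXCLUSIVE,
		/* .event_init, .add, .del, etc. omitted for brevity */
	};
]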
However, the exclusivity code is written so that only one event from any of the "exclusive" PMUs is allowed in a context. Fix this by making the exclusive event filter explicitly match two events' PMUs. Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160920154811.3255-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a54f2c2cdb20..fc9bb2225291 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3929,7 +3929,7 @@ static void exclusive_event_destroy(struct perf_event *event) static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { - if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) -- cgit v1.2.3 From 8bf46a39be910937d4c9e8d999a7438a7ae1a75b Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 16 Sep 2016 18:28:51 -0700 Subject: sched/fair: Fix SCHED_HRTICK bug leading to late preemption of tasks The SCHED_HRTICK feature is useful to preempt SCHED_FAIR tasks on-the-dot (just when they would have exceeded their ideal_runtime). It makes use of a per-CPU hrtimer resource, and hence arming that hrtimer should be based on the total number of SCHED_FAIR tasks a CPU has across its various cfs_rqs, rather than on the number of tasks in a particular cfs_rq (as implemented currently). As a result, with the current code, it's possible for a running task (which is the sole task in its cfs_rq) to be preempted long after its ideal_runtime has elapsed, resulting in increased latency for tasks in other cfs_rqs on the same CPU. Fix this by arming the sched hrtimer based on the total number of SCHED_FAIR tasks a CPU has across its various cfs_rqs. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Joonwoo Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1474075731-11550-1-git-send-email-joonwoop@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 986c10c25176..8fb4d1942c14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4469,7 +4469,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; -- cgit v1.2.3 From a18a579e5f84daa74f64b1f1b652b4a6a8d6f8b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Sep 2016 11:05:31 +0200 Subject: sched/debug: Hide printk() by default Dietmar accidentally added an unconditional sched domain printk. Hide it behind the normal sched_debug flag. Reported-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: cd92bfd3b8cb ("sched/core: Store maximum per-CPU capacity in root domain") [ Fixed !SCHED_DEBUG build failure.
] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d7babcc7cb76..8bae0cd09e9e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5739,6 +5739,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } } #else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -7006,7 +7008,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, } rcu_read_unlock(); - if (rq) { + if (rq && sched_debug_enabled) { pr_info("span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } -- cgit v1.2.3 From b193049375b04df3ada8c3347b7083db95918bc3 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Mon, 19 Sep 2016 05:23:52 -0400 Subject: locking/pv-qspinlock: Use cmpxchg_release() in __pv_queued_spin_unlock() cmpxchg_release() is more lightweight than cmpxchg() on some archs (e.g. PPC); moreover, in __pv_queued_spin_unlock() we only need a RELEASE in the fast path (pairing with *_try_lock() or *_lock()). And the slow path uses smp_store_release() too, so it is safe to use cmpxchg_release() here. Suggested-by: Boqun Feng Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: virtualization@lists.linux-foundation.org Cc: waiman.long@hpe.com Link: http://lkml.kernel.org/r/1474277037-15200-2-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 3acf16d79cf4..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -540,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; -- cgit v1.2.3 From e6253970413d99f416f7de8bd516e5f1834d8216 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 21 Nov 2015 19:11:48 +0100 Subject: stop_machine: Remove stop_cpus_lock and lg_double_lock/unlock() stop_two_cpus() and stop_cpus() use stop_cpus_lock to avoid deadlock: we need to ensure that the stopper functions can't be queued "backwards" from one another. This doesn't look nice; if we use lglock then we do not really need stopper->lock, cpu_stop_queue_work() could use lg_local_lock() under local_irq_save(). OTOH it would be even better to avoid lglock in stop_machine.c and remove lg_double_lock(). This patch adds "bool stop_cpus_in_progress", set/cleared by queue_stop_cpus_work(), and changes cpu_stop_queue_two_works() to busy wait until it is cleared. queue_stop_cpus_work() sets stop_cpus_in_progress = T lockless, but after it queues a work on CPU1 it must be visible to stop_two_cpus(CPU1, CPU2) which checks it under the same lock.
And since stop_two_cpus() holds the 2nd lock too, queue_stop_cpus_work() can not clear stop_cpus_in_progress if it is also going to queue a work on CPU2, it needs to take that 2nd lock to do this. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151121181148.GA433@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lglock.c | 22 ---------------------- kernel/stop_machine.c | 42 ++++++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c index 951cfcd10b4a..86ae2aebf004 100644 --- a/kernel/locking/lglock.c +++ b/kernel/locking/lglock.c @@ -60,28 +60,6 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) } EXPORT_SYMBOL(lg_local_unlock_cpu); -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) -{ - BUG_ON(cpu1 == cpu2); - - /* lock in cpu order, just like lg_global_lock */ - if (cpu2 < cpu1) - swap(cpu1, cpu2); - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); -} - -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) -{ - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); - preempt_enable(); -} - void lg_global_lock(struct lglock *lg) { int i; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..ae6f41fb9cba 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,7 +20,6 @@ #include #include #include -#include #include /* @@ -47,13 +46,9 @@ struct cpu_stopper { static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; -/* - * Avoids a race between stop_two_cpus and global stop_cpus, where - * the stoppers could get queued up in reverse order, leading to - * system deadlock. Using an lglock means stop_two_cpus remains - * relatively cheap. - */ -DEFINE_STATIC_LGLOCK(stop_cpus_lock); +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static bool stop_cpus_in_progress; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -230,14 +225,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); int err; - - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); +retry: spin_lock_irq(&stopper1->lock); spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) goto unlock; + /* + * Ensure that if we race with __stop_cpus() the stoppers won't get + * queued up in reverse order leading to system deadlock. + * + * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has + * queued a work on cpu1 but not on cpu2, we hold both locks. + * + * It can be falsely true but it is safe to spin until it is cleared, + * queue_stop_cpus_work() does everything under preempt_disable(). 
+ */ + err = -EDEADLK; + if (unlikely(stop_cpus_in_progress)) + goto unlock; err = 0; __cpu_stop_queue_work(stopper1, work1); @@ -245,8 +252,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, unlock: spin_unlock(&stopper2->lock); spin_unlock_irq(&stopper1->lock); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + if (unlikely(err == -EDEADLK)) { + while (stop_cpus_in_progress) + cpu_relax(); + goto retry; + } return err; } /** @@ -316,9 +327,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, return cpu_stop_queue_work(cpu, work_buf); } -/* static data for stop_cpus */ -static DEFINE_MUTEX(stop_cpus_mutex); - static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) @@ -332,7 +340,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ - lg_global_lock(&stop_cpus_lock); + preempt_disable(); + stop_cpus_in_progress = true; for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -341,7 +350,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } - lg_global_unlock(&stop_cpus_lock); + stop_cpus_in_progress = false; + preempt_enable(); return queued; } -- cgit v1.2.3 From d32cdbfb0ba319e44f75437afde868f7cafdc467 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Nov 2015 18:36:16 +0100 Subject: locking/lglock: Remove lglock implementation It is now unused, remove it before someone else thinks it's a good idea to use this. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 1 - kernel/locking/lglock.c | 89 ------------------------------------------------- 2 files changed, 90 deletions(-) delete mode 100644 kernel/locking/lglock.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o -obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 86ae2aebf004..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null @@ -1,89 +0,0 @@ -/* See include/linux/lglock.h for description */ -#include -#include -#include -#include - -/* - * Note there is no uninit, so lglocks cannot be defined in - * modules (but it's fine to use them from there) - * Could be added though, just undo lg_lock_init - */ - -void lg_lock_init(struct lglock *lg, char *name) -{ - LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); -} -EXPORT_SYMBOL(lg_lock_init); - -void lg_local_lock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock); - -void lg_local_unlock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); -
preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock); - -void lg_local_lock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock_cpu); - -void lg_local_unlock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock_cpu); - -void lg_global_lock(struct lglock *lg) -{ - int i; - - preempt_disable(); - lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); - } -} -EXPORT_SYMBOL(lg_global_lock); - -void lg_global_unlock(struct lglock *lg) -{ - int i; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); - } - preempt_enable(); -} -EXPORT_SYMBOL(lg_global_unlock); -- cgit v1.2.3 From df75e7748bae1c7098bfa358485389b897f71305 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 22 Sep 2016 13:08:36 -0500 Subject: userns: When the per user per user namespace limit is reached return ENOSPC The error codes currently returned when the per-user, per-user-namespace limits are hit (EINVAL, EUSERS, and ENFILE) are wrong. I asked for advice on linux-api and it was made clear that those were the wrong error codes, but a correct error code was not suggested. The best general error code I have found for hitting a resource limit is ENOSPC. It is not perfect, but as it is unambiguous it will serve until someone comes up with a better error code. Signed-off-by: "Eric W.
Biederman" --- kernel/cgroup.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9e4427fec46..f1dd4b076210 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6354,7 +6354,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) - return ERR_PTR(-ENFILE); + return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 30a7f3351932..7542b28cc929 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -98,7 +98,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns int i; int err; - err = -EINVAL; + err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; ucounts = inc_pid_namespaces(user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0edafe305861..f2c5ba5505f1 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -76,7 +76,7 @@ int create_user_ns(struct cred *new) struct ucounts *ucounts; int ret, i; - ret = -EUSERS; + ret = -ENOSPC; if (parent_ns->level > 32) goto fail; diff --git a/kernel/utsname.c b/kernel/utsname.c index f3b0bb4ac3ba..35587b76faa3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -49,7 +49,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct ucounts *ucounts; int err; - err = -ENFILE; + err = -ENOSPC; ucounts = inc_uts_namespaces(user_ns); if (!ucounts) goto fail; -- cgit v1.2.3 From bd6c92221d2081045cbc5f13c8c1de77e3ff569e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 8 Sep 2016 13:41:01 -0500 Subject: config: move x86 kvm_guest.config to a common location MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_guest.config is useful for KVM guests on other arches, and nothing in it appears to be x86 specific, so just move the whole file. Kbuild will find it in either location. 
Signed-off-by: Rob Herring Cc: Christoffer Dall Cc: Marc Zyngier Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: kvmarm@lists.cs.columbia.edu Cc: kvm@vger.kernel.org Acked-by: Christoffer Dall Signed-off-by: Radim Krčmář --- kernel/configs/kvm_guest.config | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 kernel/configs/kvm_guest.config (limited to 'kernel') diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config new file mode 100644 index 000000000000..9906505c998a --- /dev/null +++ b/kernel/configs/kvm_guest.config @@ -0,0 +1,31 @@ +CONFIG_NET=y +CONFIG_NET_CORE=y +CONFIG_NETDEVICES=y +CONFIG_BLOCK=y +CONFIG_BLK_DEV=y +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_INET=y +CONFIG_TTY=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_BINFMT_ELF=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DEBUG_KERNEL=y +CONFIG_VIRTUALIZATION=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_KVM_GUEST=y +CONFIG_VIRTIO=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BLK=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_NET=y +CONFIG_9P_FS=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y +CONFIG_SCSI_LOWLEVEL=y +CONFIG_SCSI_VIRTIO=y +CONFIG_VIRTIO_INPUT=y -- cgit v1.2.3 From ce836c2974dd9f1ebeb73882cec2a83b8830c749 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 8 Sep 2016 13:41:02 -0500 Subject: kvmconfig: add virtio-gpu to config fragment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virtio-gpu is used for VMs, so add it to the kvm config. Signed-off-by: Rob Herring Cc: Christoffer Dall Cc: Marc Zyngier Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: kvmarm@lists.cs.columbia.edu Cc: kvm@vger.kernel.org [expanded "frag" to "fragment" in summary] Signed-off-by: Radim Krčmář --- kernel/configs/kvm_guest.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 9906505c998a..8d9643767142 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -29,3 +29,4 @@ CONFIG_NET_9P_VIRTIO=y CONFIG_SCSI_LOWLEVEL=y CONFIG_SCSI_VIRTIO=y CONFIG_VIRTIO_INPUT=y +CONFIG_DRM_VIRTIO_GPU=y -- cgit v1.2.3 From bcac25a58bfc6bd79191ac5d7afb49bea96da8c9 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 6 Sep 2016 00:47:13 -0700 Subject: kernel: add a helper to get an owning user namespace for a namespace Return -EPERM if an owning user namespace is outside of the process's current user namespace. v2: In the first version, ns_get_owner() returned ENOENT for init_user_ns. This special case was removed in this version. There is nothing outside of init_user_ns, so we can return EPERM. v3: rename ns->get_owner() to ns->owner(). get_* usually means that it grabs a reference. Acked-by: Serge Hallyn Signed-off-by: Andrei Vagin Signed-off-by: Eric W.
Biederman --- kernel/cgroup.c | 6 ++++++ kernel/pid_namespace.c | 6 ++++++ kernel/user_namespace.c | 24 ++++++++++++++++++++++++ kernel/utsname.c | 6 ++++++ 4 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..86b0e8b16426 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6403,12 +6403,18 @@ static void cgroupns_put(struct ns_common *ns) put_cgroup_ns(to_cg_ns(ns)); } +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, + .owner = cgroupns_owner, }; static __init int cgroup_namespaces_init(void) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..c02d744225e1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *pidns_owner(struct ns_common *ns) +{ + return to_pid_ns(ns)->user_ns; +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .owner = pidns_owner, }; static __init int pid_namespaces_init(void) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..0ef683a03c20 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1004,12 +1004,36 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) return commit_creds(cred); } +struct ns_common *ns_get_owner(struct ns_common *ns) +{ + struct user_namespace *my_user_ns = current_user_ns(); + struct user_namespace *owner, *p; + + /* See if the owner is in the current user namespace */ + owner = p = ns->ops->owner(ns); + for (;;) { + if (!p) + return ERR_PTR(-EPERM); + if (p == my_user_ns) + break; + p = p->parent; + } + + return &get_user_ns(owner)->ns; +} + +static struct user_namespace *userns_owner(struct ns_common *ns) +{ + return to_user_ns(ns)->parent; +} + const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, + .owner = userns_owner, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..e1211a8a5c18 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -130,10 +130,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) return 0; } +static struct user_namespace *utsns_owner(struct ns_common *ns) +{ + return to_uts_ns(ns)->user_ns; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .owner = utsns_owner, }; -- cgit v1.2.3 From a7306ed8d94af729ecef8b6e37506a1c6fc14788 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 6 Sep 2016 00:47:15 -0700 Subject: nsfs: add ioctl to get a parent namespace Pid and user namespaces are hierarchical. There is currently no way to discover their parent-child relationships. In the future we will use this interface to dump and restore nested namespaces. Acked-by: Serge Hallyn Signed-off-by: Andrei Vagin Signed-off-by: Eric W.
Biederman --- kernel/pid_namespace.c | 19 +++++++++++++++++++ kernel/user_namespace.c | 1 + 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c02d744225e1..4fa2d56a936c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -388,6 +388,24 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct ns_common *pidns_get_parent(struct ns_common *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *pid_ns, *p; + + /* See if the parent is in the current namespace */ + pid_ns = p = to_pid_ns(ns)->parent; + for (;;) { + if (!p) + return ERR_PTR(-EPERM); + if (p == active) + break; + p = p->parent; + } + + return &get_pid_ns(pid_ns)->ns; +} + static struct user_namespace *pidns_owner(struct ns_common *ns) { return to_pid_ns(ns)->user_ns; @@ -400,6 +418,7 @@ const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, + .get_parent = pidns_get_parent, }; static __init int pid_namespaces_init(void) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0ef683a03c20..a58a219b99c6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1034,6 +1034,7 @@ const struct proc_ns_operations userns_operations = { .put = userns_put, .install = userns_install, .owner = userns_owner, + .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) -- cgit v1.2.3 From a0d0c6216afad4b2b1704a72bd76bea259e07655 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 9 Sep 2016 01:05:45 +0900 Subject: tracing: Call traceoff trigger after event is recorded Call the traceoff trigger after the event is recorded. Since the current traceoff trigger is called before the event is recorded, we cannot know which event stopped tracing. The typical use case of the traceoff/traceon trigger is tracing function calls and trace events between a pair of events. For example, trace function calls between syscall entry/exit. In that case, it is useful if we can see the return code of the target syscall. Link: http://lkml.kernel.org/r/147335074530.12462.4526186083406015005.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_trigger.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571cde24..6721a1e89f39 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = { static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, -- cgit v1.2.3 From 9157056da8f8c4a6305f15619e269f164b63a6de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 23 Sep 2016 16:55:49 -0400 Subject: cgroup: fix invalid controller enable rejections with cgroup namespace On the v2 hierarchy, "cgroup.subtree_control" rejects controller enables if the cgroup has processes in it. The enforcement of this logic assumes that the cgroup wouldn't have any css_sets associated with it if there are no tasks in the cgroup, which is no longer true since a79a908fd2b0 ("cgroup: introduce cgroup namespaces").
When a cgroup namespace is created, it pins the css_set of the creating task to use it as the root css_set of the namespace. This extra reference stays as long as the namespace is around and makes "cgroup.subtree_control" think that the namespace root cgroup is not empty even when it is and thus reject controller enables. Fix it by making cgroup_subtree_control() walk and test emptiness of each css_set instead of testing whether the list_head is empty. While at it, update the comment of cgroup_task_count() to indicate that the returned value may be higher than the number of tasks, which has always been true due to temporary references and doesn't break anything. Signed-off-by: Tejun Heo Reported-by: Evgeny Vereshchagin Cc: Serge E. Hallyn Cc: Aditya Kali Cc: Eric W. Biederman Cc: stable@vger.kernel.org # v4.6+ Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Link: https://github.com/systemd/systemd/pull/3589#issuecomment-249089541 --- kernel/cgroup.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..0d4ee1ea5c31 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3446,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { - ret = -EBUSY; - goto out_unlock; + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. + */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; } /* save and update control masks and prepare csses */ @@ -3899,7 +3918,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * - * Return the number of tasks in the cgroup. + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { -- cgit v1.2.3 From 1245800c0f96eb6ebb368593e251d66c01e61022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 23 Sep 2016 22:57:13 -0400 Subject: tracing: Move mutex to protect against resetting of seq data The iter->seq can be reset outside the protection of the mutex. So can reading of user data. Move the mutex up to the beginning of the function. 
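[ Editorial aside, a sketch of the locking pattern this fix establishes, simplified from the diff that follows rather than copied verbatim:

	/* tracing_read_pipe(), after the fix: every access to iter->seq,
	 * including the leftover-data fast path, happens with iter->mutex
	 * held, so a concurrent reset of iter->seq can no longer race
	 * with trace_seq_to_user(). */
	mutex_lock(&iter->mutex);

	/* return any leftover data */
	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
	if (sret != -EBUSY)
		goto out;

	trace_seq_init(&iter->seq);
	/* ... read fresh data into iter->seq ... */
out:
	mutex_unlock(&iter->mutex);
]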
Fixes: d7350c3f45694 ("tracing/core: make the read callbacks reentrants") Cc: stable@vger.kernel.org # 2.6.30+ Reported-by: Al Viro Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4bd6b68a0b..8fb4847b0450 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4890,19 +4890,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; ssize_t sret; - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) -- cgit v1.2.3 From 1ae2293dd6d2f5c823cf97e60b70d03631cd622f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 17 Sep 2016 18:31:46 -0400 Subject: fix memory leaks in tracing_buffers_splice_read() Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- kernel/trace/trace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8fb4847b0450..77eeab2776ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5930,9 +5930,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; @@ -5942,6 +5939,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -5999,19 +5999,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { if (ret) - return ret; + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) - return -EAGAIN; + goto out; ret = wait_on_pipe(iter, true); if (ret) - return ret; + goto out; goto again; } ret = splice_to_pipe(pipe, &spd); +out: splice_shrink_spd(&spd); return ret; -- cgit v1.2.3 From b8129a1f6aaaca02d92186acf19ceb545b4b489a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 25 Sep 2016 15:36:39 +0000 Subject: genirq: Make function __irq_do_set_handler() static Fixes the following sparse warning: kernel/irq/chip.c:786:1: warning: symbol '__irq_do_set_handler' was not declared. Should it be static? 
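[ Editorial aside on this class of warning, shown with a self-contained toy example rather than the kernel code: sparse flags any external-linkage definition that has no declaration visible at the definition site, since that usually means the symbol is file-local by accident.

	/* No header declares double_it() and nothing outside this file
	 * uses it, so give it internal linkage; without "static", sparse
	 * warns: symbol 'double_it' was not declared. Should it be
	 * static? */
	static int double_it(int x)
	{
		return x * 2;
	}

	int main(void)
	{
		return double_it(21);
	}
]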
Signed-off-by: Wei Yongjun Link: http://lkml.kernel.org/r/1474817799-18676-1-git-send-email-weiyj.lk@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d8dfdc630b7e..be3c34e4f2ac 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -782,7 +782,7 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -void +static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) { -- cgit v1.2.3 From 1955351da41caa1dbf4139191358fed84909d64b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Sat, 24 Sep 2016 20:01:50 +0200 Subject: bpf: Set register type according to is_valid_access() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prevents future potential pointer leaks when an unprivileged eBPF program reads a pointer value from its context. Even if is_valid_access() returns a pointer type, the eBPF verifier replaces it with UNKNOWN_VALUE. The register value that contains a kernel address is then allowed to leak. Moreover, this fix allows unprivileged eBPF programs to use functions with (legitimate) pointer arguments. Not an issue currently, since reg_type is only set for PTR_TO_PACKET or PTR_TO_PACKET_END in XDP and TC programs, which can only be loaded as privileged. For now, the only unprivileged eBPF program allowed is for socket filtering and all the types from its context are UNKNOWN_VALUE. However, this fix is important for future unprivileged eBPF programs which could use pointers in their context. Signed-off-by: Mickaël Salaün Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c3f794c7028..7ada3152a556 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -749,9 +749,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, err = check_ctx_access(env, off, size, t, &reg_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); - if (env->allow_ptr_leaks) - /* note that reg.[id|off|range] == 0 */ - state->regs[value_regno].type = reg_type; + /* note that reg.[id|off|range] == 0 */ + state->regs[value_regno].type = reg_type; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { -- cgit v1.2.3 From e0e0be8a835520e2f7c89f214dfda570922a1b90 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 27 Sep 2016 11:03:57 +0200 Subject: libfs: support RENAME_NOREPLACE in simple_rename() This is trivial to do: - add flags argument to simple_rename() - check that flags contains nothing other than RENAME_NOREPLACE - assign simple_rename() to .rename2 instead of .rename Filesystems converted: hugetlbfs, ramfs, bpf. Debugfs uses simple_rename() to implement debugfs_rename(), which is for debugfs instances to rename files internally, not for userspace filesystem access. For this case pass zero flags to simple_rename().
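[ Editorial aside, a hedged sketch of the contract being relied on here; the handler below is illustrative, not the actual patch (which follows):

	#include <linux/fs.h>

	/* A ->rename2() implementation must reject any flag it does not
	 * implement.  RENAME_NOREPLACE itself costs a simple filesystem
	 * nothing extra: by the time the method is called, the VFS has
	 * already checked that the destination dentry does not exist. */
	static int example_rename2(struct inode *old_dir, struct dentry *old_dentry,
				   struct inode *new_dir, struct dentry *new_dentry,
				   unsigned int flags)
	{
		if (flags & ~RENAME_NOREPLACE)
			return -EINVAL;

		/* ... perform the plain rename ... */
		return 0;
	}
]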
Signed-off-by: Miklos Szeredi Acked-by: Greg Kroah-Hartman Cc: Alexei Starovoitov --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5967b870a895..c92fd8936d33 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -189,7 +189,7 @@ static const struct inode_operations bpf_dir_iops = { .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, - .rename = simple_rename, + .rename2 = simple_rename, .link = simple_link, .unlink = simple_unlink, }; -- cgit v1.2.3 From 2773bf00aeb9bf39e022463272a61dd0ec9f55f4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 27 Sep 2016 11:03:58 +0200 Subject: fs: rename "rename2" i_op to "rename" Generated patch: sed -i "s/\.rename2\t/\.rename\t\t/" `git grep -wl rename2` sed -i "s/\brename2\b/rename/g" `git grep -wl rename2` Signed-off-by: Miklos Szeredi --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index c92fd8936d33..5967b870a895 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -189,7 +189,7 @@ static const struct inode_operations bpf_dir_iops = { .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, - .rename2 = simple_rename, + .rename = simple_rename, .link = simple_link, .unlink = simple_unlink, }; -- cgit v1.2.3 From 9b80a184eaadc117f27faad522008f31d571621b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 2 Sep 2016 00:38:52 +0300 Subject: fs/file: more unsigned file descriptors Propagate unsignedness for grand total of 149 bytes: $ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux add/remove: 0/0 grow/shrink: 0/10 up/down: 0/-149 (-149) function old new delta set_close_on_exec 99 98 -1 put_files_struct 201 200 -1 get_close_on_exec 59 58 -1 do_prlimit 498 497 -1 do_execveat_common.isra 1662 1661 -1 __close_fd 178 173 -5 do_dup2 219 204 -15 seq_show 685 660 -25 __alloc_fd 384 357 -27 dup_fd 718 646 -72 It mostly comes from converting "unsigned int" to "long" for bit operations. Signed-off-by: Alexey Dobriyan Signed-off-by: Al Viro --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bbdaab47d..d66321036f21 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,7 +108,7 @@ extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; -extern int sysctl_nr_open_min, sysctl_nr_open_max; +extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif @@ -1692,7 +1692,7 @@ static struct ctl_table fs_table[] = { { .procname = "nr_open", .data = &sysctl_nr_open, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &sysctl_nr_open_min, -- cgit v1.2.3 From 078cd8279e659989b103359bb22373cc79445bde Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 14 Sep 2016 07:48:04 -0700 Subject: fs: Replace CURRENT_TIME with current_time() for inode timestamps CURRENT_TIME macro is not appropriate for filesystems as it doesn't use the right granularity for filesystem timestamps. Use current_time() instead. CURRENT_TIME is also not y2038 safe. This is also in preparation for the patch that transitions vfs timestamps to use 64 bit time and hence make them y2038 safe. 
As part of the effort current_time() will be extended to do range checks. Hence, it is necessary for all file system timestamps to use current_time(). Also, current_time() will be transitioned along with vfs to be y2038 safe. Note that whenever a single call to current_time() is used to change timestamps in different inodes, it is because they share the same time granularity. Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Acked-by: Felipe Balbi Acked-by: Steven Whitehouse Acked-by: Ryusuke Konishi Acked-by: David Sterba Signed-off-by: Al Viro --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5967b870a895..1ed8473ec537 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -97,7 +97,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); - inode->i_atime = CURRENT_TIME; + inode->i_atime = current_time(inode); inode->i_mtime = inode->i_atime; inode->i_ctime = inode->i_atime; -- cgit v1.2.3 From 9dcfcda5768eda793e15a1a73da38cfd1fc1a47a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Sep 2016 09:45:24 +1000 Subject: compat: remove compat_printk() After 7e8e385aaf6e ("x86/compat: Remove sys32_vm86_warning"), this function has become unused, so we can remove it as well. Link: http://lkml.kernel.org/r/20160617142903.3070388-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Alexander Viro Cc: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Signed-off-by: Andrew Morton --- kernel/sysctl.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d66321036f21..98ba312d1088 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; extern int pid_max; extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; -extern int compat_log; extern int latencytop_enabled; extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU @@ -1084,15 +1083,6 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, }, #endif -#ifdef CONFIG_COMPAT - { - .procname = "compat-log", - .data = &compat_log, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", -- cgit v1.2.3 From b761fe226be61eba6ae20a424b288559b8a606b5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Sep 2016 08:42:41 -0700 Subject: bpf: clean up put_cpu_var usage put_cpu_var takes the percpu data, not the data returned from get_cpu_var. This doesn't change the behavior. Cc: Tejun Heo Cc: Alexei Starovoitov Signed-off-by: Shaohua Li Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Signed-off-by: David S. 
Miller --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7b7baaed9ed4..aa6d98154106 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1031,7 +1031,7 @@ BPF_CALL_0(bpf_user_rnd_u32) state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); - put_cpu_var(state); + put_cpu_var(bpf_user_rnd_state); return res; } -- cgit v1.2.3 From 484611357c19f9e19ef742ebef4505a07d243cc9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 28 Sep 2016 10:54:32 -0400 Subject: bpf: allow access into map value arrays Suppose you have a map array value that is something like this struct foo { unsigned iter; int array[SOME_CONSTANT]; }; You can easily insert this into an array, but you cannot modify the contents of foo->array[] after the fact. This is because we have no way to verify we won't go off the end of the array at verification time. This patch provides a start for this work. We accomplish this by keeping track of a minimum and maximum value a register could be while we're checking the code. Then at the time we try to do an access into a MAP_VALUE we verify that the maximum offset into that region is a valid access into that memory region. So in practice, code such as this unsigned index = 0; if (foo->iter >= SOME_CONSTANT) foo->iter = index; else index = foo->iter++; foo->array[index] = bar; would be allowed, as we can verify that index will always be between 0 and SOME_CONSTANT-1. If you wish to use signed values you'll have to have an extra check to make sure the index isn't less than 0, or do something like index %= SOME_CONSTANT. Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 329 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 311 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7ada3152a556..99a7e5b388f2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -182,6 +182,7 @@ static const char * const reg_type_str[] = { [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", @@ -209,10 +210,17 @@ static void print_verifier_state(struct bpf_verifier_state *state) else if (t == UNKNOWN_VALUE && reg->imm) verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + t == PTR_TO_MAP_VALUE_OR_NULL || + t == PTR_TO_MAP_VALUE_ADJ) verbose("(ks=%d,vs=%d)", reg->map_ptr->key_size, reg->map_ptr->value_size); + if (reg->min_value != BPF_REGISTER_MIN_RANGE) + verbose(",min_value=%llu", + (unsigned long long)reg->min_value); + if (reg->max_value != BPF_REGISTER_MAX_RANGE) + verbose(",max_value=%llu", + (unsigned long long)reg->max_value); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -424,6 +432,8 @@ static void init_reg_state(struct bpf_reg_state *regs) for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; + regs[i].min_value = BPF_REGISTER_MIN_RANGE; + regs[i].max_value = BPF_REGISTER_MAX_RANGE; } /* frame pointer */ @@ -440,6 +450,12 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) regs[regno].imm = 0; } +static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) +{ + 
regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -665,7 +681,7 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) static int check_ptr_alignment(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int off, int size) { - if (reg->type != PTR_TO_PACKET) { + if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { if (off % size != 0) { verbose("misaligned access off %d size %d\n", off, size); @@ -675,16 +691,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, } } - switch (env->prog->type) { - case BPF_PROG_TYPE_SCHED_CLS: - case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_XDP: - break; - default: - verbose("verifier is misconfigured\n"); - return -EACCES; - } - if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) /* misaligned access to packet is ok on x86,arm,arm64 */ return 0; @@ -695,7 +701,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, } /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + if (reg->type == PTR_TO_PACKET && + (NET_IP_ALIGN + reg->off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", NET_IP_ALIGN, reg->off, off, size); return -EACCES; @@ -728,12 +735,52 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, if (err) return err; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE || + reg->type == PTR_TO_MAP_VALUE_ADJ) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); return -EACCES; } + + /* If we adjusted the register to this map value at all then we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (reg->type == PTR_TO_MAP_VALUE_ADJ) { + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if ((s64)reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, + size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. 
+ */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + off += reg->max_value; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); @@ -1195,6 +1242,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() @@ -1416,6 +1464,106 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, return 0; } +static void check_reg_overflow(struct bpf_reg_state *reg) +{ + if (reg->max_value > BPF_REGISTER_MAX_RANGE) + reg->max_value = BPF_REGISTER_MAX_RANGE; + if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + reg->min_value = BPF_REGISTER_MIN_RANGE; +} + +static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; + bool min_set = false, max_set = false; + u8 opcode = BPF_OP(insn->code); + + dst_reg = ®s[insn->dst_reg]; + if (BPF_SRC(insn->code) == BPF_X) { + check_reg_overflow(®s[insn->src_reg]); + min_val = regs[insn->src_reg].min_value; + max_val = regs[insn->src_reg].max_value; + + /* If the source register is a random pointer then the + * min_value/max_value values represent the range of the known + * accesses into that value, not the actual min/max value of the + * register itself. In this case we have to reset the reg range + * values so we know it is not safe to look at. + */ + if (regs[insn->src_reg].type != CONST_IMM && + regs[insn->src_reg].type != UNKNOWN_VALUE) { + min_val = BPF_REGISTER_MIN_RANGE; + max_val = BPF_REGISTER_MAX_RANGE; + } + } else if (insn->imm < BPF_REGISTER_MAX_RANGE && + (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { + min_val = max_val = insn->imm; + min_set = max_set = true; + } + + /* We don't know anything about what was done to this register, mark it + * as unknown. + */ + if (min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) { + reset_reg_range_values(regs, insn->dst_reg); + return; + } + + switch (opcode) { + case BPF_ADD: + dst_reg->min_value += min_val; + dst_reg->max_value += max_val; + break; + case BPF_SUB: + dst_reg->min_value -= min_val; + dst_reg->max_value -= max_val; + break; + case BPF_MUL: + dst_reg->min_value *= min_val; + dst_reg->max_value *= max_val; + break; + case BPF_AND: + /* & is special since it could end up with 0 bits set. */ + dst_reg->min_value &= min_val; + dst_reg->max_value = max_val; + break; + case BPF_LSH: + /* Gotta have special overflow logic here, if we're shifting + * more than MAX_RANGE then just assume we have an invalid + * range. 
+ */ + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value <<= min_val; + + if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + else + dst_reg->max_value <<= max_val; + break; + case BPF_RSH: + dst_reg->min_value >>= min_val; + dst_reg->max_value >>= max_val; + break; + case BPF_MOD: + /* % is special since it is an unsigned modulus, so the floor + * will always be 0. + */ + dst_reg->min_value = 0; + dst_reg->max_value = max_val - 1; + break; + default: + reset_reg_range_values(regs, insn->dst_reg); + break; + } + + check_reg_overflow(dst_reg); +} + /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1479,6 +1627,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + /* we are setting our register to something new, we need to + * reset its range values. + */ + reset_reg_range_values(regs, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X) { if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 @@ -1500,6 +1653,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].max_value = insn->imm; + regs[insn->dst_reg].min_value = insn->imm; } } else if (opcode > BPF_END) { @@ -1552,6 +1707,9 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg = ®s[insn->dst_reg]; + /* first we want to adjust our ranges. */ + adjust_reg_min_max_vals(env, insn); + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { @@ -1586,8 +1744,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* mark dest operand */ - mark_reg_unknown_value(regs, insn->dst_reg); + /* If we did pointer math on a map value then just set it to our + * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or + * loads to this register appropriately, otherwise just mark the + * register as unknown. + */ + if (env->allow_ptr_leaks && + (dst_reg->type == PTR_TO_MAP_VALUE || + dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) + dst_reg->type = PTR_TO_MAP_VALUE_ADJ; + else + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; @@ -1642,6 +1809,104 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, } } +/* Adjusts the register min/max values in the case that the dst_reg is the + * variable register that we are working on, and src_reg is a constant or we're + * simply doing a BPF_K check. + */ +static void reg_set_min_max(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGT: + /* If this is false then we know the maximum val is val, + * otherwise we know the min val is val+1. 
+ */ + false_reg->max_value = val; + true_reg->min_value = val + 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then we know the maximum value is val - 1, + * otherwise we know the mimimum value is val. + */ + false_reg->max_value = val - 1; + true_reg->min_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + +/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg + * is the variable reg. + */ +static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGT: + /* + * If this is false, then the val is <= the register, if it is + * true the register <= to the val. + */ + false_reg->min_value = val; + true_reg->max_value = val - 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then constant < register, if it is true then + * the register < constant. + */ + false_reg->min_value = val + 1; + true_reg->max_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -1708,6 +1973,23 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (!other_branch) return -EFAULT; + /* detect if we are comparing against a constant value so we can adjust + * our min/max values for our dst register. + */ + if (BPF_SRC(insn->code) == BPF_X) { + if (regs[insn->src_reg].type == CONST_IMM) + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, regs[insn->src_reg].imm, + opcode); + else if (dst_reg->type == CONST_IMM) + reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + ®s[insn->src_reg], dst_reg->imm, + opcode); + } else { + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, insn->imm, opcode); + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && @@ -2144,7 +2426,8 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct bpf_verifier_state *old, +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { struct bpf_reg_state *rold, *rcur; @@ -2157,6 +2440,13 @@ static bool states_equal(struct bpf_verifier_state *old, if (memcmp(rold, rcur, sizeof(*rold)) == 0) continue; + /* If the ranges were not the same, but everything else was and + * we didn't do a variable access into a map then we are a-ok. 
+ */ + if (!env->varlen_map_value_access && + rold->type == rcur->type && rold->imm == rcur->imm) + continue; + if (rold->type == NOT_INIT || (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) continue; @@ -2213,7 +2503,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(&sl->state, &env->cur_state)) + if (states_equal(env, &sl->state, &env->cur_state)) /* reached equivalent register/stack state, * prune the search */ @@ -2259,6 +2549,7 @@ static int do_check(struct bpf_verifier_env *env) init_reg_state(regs); insn_idx = 0; + env->varlen_map_value_access = false; for (;;) { struct bpf_insn *insn; u8 class; @@ -2339,6 +2630,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + reset_reg_range_values(regs, insn->dst_reg); if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; @@ -2509,6 +2801,7 @@ process_bpf_exit: verbose("invalid BPF_LD mode\n"); return -EINVAL; } + reset_reg_range_values(regs, insn->dst_reg); } else { verbose("unknown insn class %d\n", class); return -EINVAL; -- cgit v1.2.3 From d282b9c0ac9f7fb5dff5b146e37cf820c610dacc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 12 Sep 2016 14:46:23 +0100 Subject: tracing/syscalls: fix multiline in error message text pr_info message spans two lines and the literal string is missing a white space between words. Add the white space. Signed-off-by: Colin Ian King Acked-by: Ingo Molnar Acked-by: Steven Rostedt Signed-off-by: Jiri Kosina --- kernel/trace/trace_syscalls.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b2b6efc083a4..5e10395da88e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call) if (!sys_perf_refcount_enter) ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { - pr_info("event trace: Could not activate" - "syscall entry trace point"); + pr_info("event trace: Could not activate syscall entry trace point"); } else { set_bit(num, enabled_perf_enter_syscalls); sys_perf_refcount_enter++; @@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call) if (!sys_perf_refcount_exit) ret = register_trace_sys_exit(perf_syscall_exit, NULL); if (ret) { - pr_info("event trace: Could not activate" - "syscall exit trace point"); + pr_info("event trace: Could not activate syscall exit trace point"); } else { set_bit(num, enabled_perf_exit_syscalls); sys_perf_refcount_exit++; -- cgit v1.2.3 From 679a5e3f12830392d14f94b04bbe0f3cabdbd773 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Sep 2016 11:58:36 +0200 Subject: cpuset: fix error handling regression in proc_cpuset_show() 4c737b41de7f ("cgroup: make cgroup_path() and friends behave in the style of strlcpy()") botched the conversion of proc_cpuset_show() and broke its error handling. It made the function return 0 on failures and fail to handle error returns from cgroup_path_ns(). Fix it. 
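For illustration, a minimal sketch (not part of the patch) of the strlcpy-style return convention that proc_cpuset_show() must now honor: a negative return is an errno, and a return of PATH_MAX or more means the path was truncated:

    /* sketch: handling a cgroup_path_ns()-style return value */
    ret = cgroup_path_ns(cgrp, buf, PATH_MAX, ns);
    if (ret >= PATH_MAX)    /* path did not fit in buf */
            ret = -ENAMETOOLONG;
    if (ret < 0)            /* any error must be propagated, not dropped */
            goto out_free;
    /* success: buf holds the NUL-terminated path */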
Reported-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cpuset.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 793ae6fd96dc..97dd8e178786 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2698,12 +2698,13 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, current->nsproxy->cgroup_ns); css_put(css); if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) goto out_free; seq_puts(m, buf); seq_putc(m, '\n'); @@ -2711,7 +2712,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, out_free: kfree(buf); out: - return 0; + return retval; } #endif /* CONFIG_PROC_PID_CPUSET */ -- cgit v1.2.3 From e0223003e6e141533446d01a92784592a97a8552 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Sep 2016 15:49:40 +0200 Subject: cgroup: fix error handling regressions in proc_cgroup_show() and cgroup_release_agent() 4c737b41de7f ("cgroup: make cgroup_path() and friends behave in the style of strlcpy()") broke error handling in proc_cgroup_show() and cgroup_release_agent() by not handling negative return values from cgroup_path_ns_locked(). Fix it. Reported-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5e2e81ad9175..a7f9fb4e1fc7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5781,10 +5781,10 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) { + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; + if (retval < 0) goto out_unlock; - } seq_puts(m, buf); } else { @@ -6069,7 +6069,7 @@ static void cgroup_release_agent(struct work_struct *work) spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (ret >= PATH_MAX) + if (ret < 0 || ret >= PATH_MAX) goto out; argv[0] = agentbuf; -- cgit v1.2.3 From 4cd13c21b207e80ddb1144c576500098f2d5f882 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2016 10:42:29 -0700 Subject: softirq: Let ksoftirqd do its job A while back, Paolo and Hannes sent an RFC patch adding threaded-able napi poll loop support : (https://patchwork.ozlabs.org/patch/620657/) The problem seems to be that softirqs are very aggressive and are often handled by the current process, even if we are under stress and that ksoftirqd was scheduled, so that innocent threads would have more chance to make progress. This patch makes sure that if ksoftirq is running, we let it perform the softirq work. Jonathan Corbet summarized the issue in https://lwn.net/Articles/687617/ Tested: - NIC receiving traffic handled by CPU 0 - UDP receiver running on CPU 0, using a single UDP socket. - Incoming flood of UDP packets targeting the UDP socket. Before the patch, the UDP receiver could almost never get CPU cycles and could only receive ~2,000 packets per second. After the patch, CPU cycles are split 50/50 between user application and ksoftirqd/0, and we can effectively read ~900,000 packets per second, a huge improvement in DOS situation. 
(Note that more packets are now dropped by the NIC itself, since the BH handlers get less CPU cycles to drain RX ring buffer) Since the load runs in well identified threads context, an admin can more easily tune process scheduling parameters if needed. Reported-by: Paolo Abeni Reported-by: Hannes Frederic Sowa Signed-off-by: Eric Dumazet Signed-off-by: Peter Zijlstra (Intel) Cc: David Miller Cc: Hannes Frederic Sowa Cc: Jesper Dangaard Brouer Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1472665349.14381.356.camel@edumazet-glaptop3.roam.corp.google.com Signed-off-by: Ingo Molnar --- kernel/softirq.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..8ed90e3a88d6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,6 +77,17 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +/* + * If ksoftirqd is scheduled, we do not want to process pending softirqs + * right now. Let ksoftirqd handle this at its own rate, to get fairness. + */ +static bool ksoftirqd_running(void) +{ + struct task_struct *tsk = __this_cpu_read(ksoftirqd); + + return tsk && (tsk->state == TASK_RUNNING); +} + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending) + if (pending && !ksoftirqd_running()) do_softirq_own_stack(); local_irq_restore(flags); @@ -340,6 +351,9 @@ void irq_enter(void) static inline void invoke_softirq(void) { + if (ksoftirqd_running()) + return; + if (!force_irqthreads) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* -- cgit v1.2.3 From 8f37961cf22304fb286c7604d3a7f6104dcc1283 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 21 Sep 2016 12:19:03 -0700 Subject: sched/core, x86/topology: Fix NUMA in package topology bug Current code can call set_cpu_sibling_map() and invoke sched_set_topology() more than once (e.g. on CPU hot plug). When this happens after sched_init_smp() has been called, we lose the NUMA topology extension to sched_domain_topology in sched_init_numa(). This results in incorrect topology when the sched domain is rebuilt. This patch fixes the bug and issues warning if we call sched_set_topology() after sched_init_smp(). 
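For context, a hedged sketch of the call being guarded: an architecture that overrides the default topology (names modeled loosely on the x86 usage; illustrative only) must now do so before sched_init_smp() has run, or the new WARN_ON_ONCE() fires:

    static struct sched_domain_topology_level arch_topology[] = {
    #ifdef CONFIG_SCHED_SMT
            { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
    #endif
            { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
            { cpu_cpu_mask, SD_INIT_NAME(DIE) },
            { NULL, },
    };

    set_sched_topology(arch_topology);  /* valid only before sched_init_smp() */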
Signed-off-by: Tim Chen Signed-off-by: Srinivas Pandruvada Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Cc: jolsa@redhat.com Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1474485552-141429-2-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8bae0cd09e9e..af5d4d7962df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6552,6 +6552,9 @@ static struct sched_domain_topology_level *sched_domain_topology = void set_sched_topology(struct sched_domain_topology_level *tl) { + if (WARN_ON_ONCE(sched_smp_initialized)) + return; + sched_domain_topology = tl; } -- cgit v1.2.3 From ab522e33f91799661aad47bebb691f241a9f6bb8 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 22 Aug 2016 15:00:41 +0100 Subject: sched/fair: Fix fixed point arithmetic width for shares and effective load Since commit: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") we now have two different fixed point units for load: - 'shares' in calc_cfs_shares() has 20 bit fixed point unit on 64-bit kernels. Therefore use scale_load() on MIN_SHARES. - 'wl' in effective_load() has 10 bit fixed point unit. Therefore use scale_load_down() on tg->shares which has 20 bit fixed point unit on 64-bit kernels. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471874441-24701-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fb4d1942c14..786ef94197e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5017,9 +5017,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * (long)tg->shares) / W; + wl = (w * (long)scale_load_down(tg->shares)) / W; else - wl = tg->shares; + wl = scale_load_down(tg->shares); /* * Per the above, wl is the new se->load.weight value; since -- cgit v1.2.3 From 38a3e1fc1dac480f3672ab22fc97e1f995c80ed7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Sep 2016 16:00:47 +0200 Subject: sched/wait: Fix abort_exclusive_wait(), it should pass TASK_NORMAL to wake_up() Otherwise this logic only works if mode is "compatible" with another exclusive waiter. If some wq has both TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE waiters, abort_exclusive_wait() won't wait an uninterruptible waiter. The main user is __wait_on_bit_lock() and currently it is fine but only because TASK_KILLABLE includes TASK_UNINTERRUPTIBLE and we do not have lock_page_interruptible() yet. Just use TASK_NORMAL and remove the "mode" arg from abort_exclusive_wait(). Yes, this means that (say) wake_up_interruptible() can wake up the non- interruptible waiter(s), but I think this is fine. And in fact I think that abort_exclusive_wait() must die, see the next change. 
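For reference, TASK_NORMAL covers both sleep states, which is why it is the safe mode for waking an arbitrary next exclusive waiter (sketch of the relevant definition and the scenario being fixed):

    /* include/linux/sched.h */
    #define TASK_NORMAL  (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

    /*
     * Scenario fixed: an exclusive TASK_INTERRUPTIBLE waiter aborts on a
     * signal while a TASK_UNINTERRUPTIBLE waiter sits behind it on the
     * same queue. Re-waking with the aborter's own mode cannot wake the
     * uninterruptible waiter; waking with TASK_NORMAL always can.
     */
    __wake_up_locked_key(q, TASK_NORMAL, key);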
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160906140047.GA6157@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..2bbba0175ab2 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -259,7 +259,6 @@ EXPORT_SYMBOL(finish_wait); * abort_exclusive_wait - abort exclusive waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor - * @mode: runstate of the waiter to be woken * @key: key to identify a wait bit queue or %NULL * * Sets current thread back to running state and removes @@ -273,8 +272,7 @@ EXPORT_SYMBOL(finish_wait); * aborts and is woken up concurrently and no one wakes up * the next waiter. */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key) { unsigned long flags; @@ -283,7 +281,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); + __wake_up_locked_key(q, TASK_NORMAL, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); @@ -434,7 +432,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, ret = action(&q->key, mode); if (!ret) continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); + abort_exclusive_wait(wq, &q->wait, &q->key); return ret; } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); finish_wait(wq, &q->wait); -- cgit v1.2.3 From b1ea06a90f528e516929a4da1d9b8838752bceb9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Sep 2016 18:48:15 +0200 Subject: sched/wait: Avoid abort_exclusive_wait() in ___wait_event() ___wait_event() doesn't really need abort_exclusive_wait(), we can simply change prepare_to_wait_event() to remove the waiter from q->task_list if it was interrupted. This simplifies the code/logic, and this way prepare_to_wait_event() can have more users, see the next change. 
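A minimal sketch of the ___wait_event() loop this change enables (schematic; the real macro lives in include/linux/wait.h):

    for (;;) {
            long ret = prepare_to_wait_event(&wq, &wait, TASK_INTERRUPTIBLE);

            if (condition)
                    break;
            if (ret) {
                    /*
                     * -ERESTARTSYS: prepare_to_wait_event() has already
                     * removed us from wq->task_list, so no extra
                     * abort_exclusive_wait()-style cleanup is needed.
                     */
                    err = ret;
                    break;
            }
            schedule();
    }
    finish_wait(&wq, &wait);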
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160908164815.GA18801@redhat.com Signed-off-by: Ingo Molnar -- include/linux/wait.h | 7 +------ kernel/sched/wait.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) --- kernel/sched/wait.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 2bbba0175ab2..261239392258 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -199,24 +199,39 @@ EXPORT_SYMBOL(prepare_to_wait_exclusive); long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; - - if (signal_pending_state(state, current)) - return -ERESTARTSYS; + long ret = 0; wait->private = current; wait->func = autoremove_wake_function; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); - else - __add_wait_queue(q, wait); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same q->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); } - set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_wait_event); -- cgit v1.2.3 From eaf9ef52241b545fe63621266bfc6fd8b06559ff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Sep 2016 16:00:53 +0200 Subject: sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock() __wait_on_bit_lock() doesn't need abort_exclusive_wait() too. Right now it can't use prepare_to_wait_event() (see the next change), but it can do the additional finish_wait() if action() fails. abort_exclusive_wait() no longer has callers, remove it. 
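For context, a sketch of how a caller typically reaches __wait_on_bit_lock() (modeled on the wait_on_bit_lock() helpers; the function name and bit number are illustrative):

    static int lock_my_bit(unsigned long *word)
    {
            wait_queue_head_t *wq = bit_waitqueue(word, 0);
            DEFINE_WAIT_BIT(q, word, 0);

            if (!test_and_set_bit(0, word))
                    return 0;       /* fast path: bit was clear */
            return __wait_on_bit_lock(wq, &q, bit_wait, TASK_UNINTERRUPTIBLE);
    }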
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160906140053.GA6164@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 64 ++++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 261239392258..0cb615d69013 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -270,37 +270,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -/** - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and no one wakes up - * the next waiter. - */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_locked_key(q, TASK_NORMAL, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -438,20 +407,29 @@ int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { - do { - int ret; + int ret = 0; + for (;;) { prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(&q->key, mode); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; + if (test_bit(q->key.bit_nr, q->key.flags)) { + ret = action(&q->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wq->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). + */ + if (ret) + finish_wait(wq, &q->wait); + } + if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { + if (!ret) + finish_wait(wq, &q->wait); + return 0; + } else if (ret) { + return ret; + } + } } EXPORT_SYMBOL(__wait_on_bit_lock); -- cgit v1.2.3 From 0176beaffbe9ed627b6a4dfa61d640f1a848086f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Sep 2016 16:00:55 +0200 Subject: sched/wait: Introduce init_wait_entry() The partial initialization of wait_queue_t in prepare_to_wait_event() looks ugly. This was done to shrink .text, but we can simply add the new helper which does the full initialization and shrink the compiled code a bit more. And. This way prepare_to_wait_event() can have more users. In particular we are ready to remove the signal_pending_state() checks from wait_bit_action_f helpers and change __wait_on_bit_lock() to use prepare_to_wait_event(). 
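A sketch of the helper's effect at a call site: the open-coded, partial setup is replaced by one full initializer covering flags, private, func and the list head:

    wait_queue_t wait;

    init_wait_entry(&wait, 0);                  /* ordinary waiter  */
    /* or: */
    init_wait_entry(&wait, WQ_FLAG_EXCLUSIVE);  /* exclusive waiter */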
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160906140055.GA6167@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0cb615d69013..4f7053579fe3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -196,14 +196,20 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); +void init_wait_entry(wait_queue_t *wait, int flags) +{ + wait->flags = flags; + wait->private = current; + wait->func = autoremove_wake_function; + INIT_LIST_HEAD(&wait->task_list); +} +EXPORT_SYMBOL(init_wait_entry); + long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; long ret = 0; - wait->private = current; - wait->func = autoremove_wake_function; - spin_lock_irqsave(&q->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* -- cgit v1.2.3 From f39180efe5737af8467139bf8af7ca27057be87f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:37:54 +0200 Subject: sched/core: Remove unused @cpu argument from destroy_sched_domain*() Small cleanup; nothing uses the @cpu argument so make it go away. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index af5d4d7962df..b3cd5553ecbb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5952,15 +5952,15 @@ static void free_sched_domain(struct rcu_head *rcu) kfree(sd); } -static void destroy_sched_domain(struct sched_domain *sd, int cpu) +static void destroy_sched_domain(struct sched_domain *sd) { call_rcu(&sd->rcu, free_sched_domain); } -static void destroy_sched_domains(struct sched_domain *sd, int cpu) +static void destroy_sched_domains(struct sched_domain *sd) { for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); + destroy_sched_domain(sd); } /* @@ -6032,7 +6032,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) */ if (parent->flags & SD_PREFER_SIBLING) tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent, cpu); + destroy_sched_domain(parent); } else tmp = tmp->parent; } @@ -6040,7 +6040,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp, cpu); + destroy_sched_domain(tmp); if (sd) sd->child = NULL; } @@ -6050,7 +6050,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); + destroy_sched_domains(tmp); update_top_cache_domain(cpu); } -- cgit v1.2.3 From 16f3ef46805a5ffc75549deac2ff6af08bdf590b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:37:57 +0200 Subject: sched/core: Restructure destroy_sched_domain() There is no point in doing a call_rcu() for each domain, only do a callback for the root sched domain and clean up the entire set in one go. 
Also make the entire call chain be called destroy_sched_domain*() to remove confusion with the free_sched_domains() call, which does an entirely different thing. Both cpu_attach_domain() callers of destroy_sched_domain() can live without the call_rcu() because at those points the sched_domain hasn't been published yet. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b3cd5553ecbb..662d08d7b1df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5935,10 +5935,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } -static void free_sched_domain(struct rcu_head *rcu) +static void destroy_sched_domain(struct sched_domain *sd) { - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - /* * If its an overlapping domain it has private groups, iterate and * nuke them all. @@ -5952,15 +5950,21 @@ static void free_sched_domain(struct rcu_head *rcu) kfree(sd); } -static void destroy_sched_domain(struct sched_domain *sd) +static void destroy_sched_domains_rcu(struct rcu_head *rcu) { - call_rcu(&sd->rcu, free_sched_domain); + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + while (sd) { + struct sched_domain *parent = sd->parent; + destroy_sched_domain(sd); + sd = parent; + } } static void destroy_sched_domains(struct sched_domain *sd) { - for (; sd; sd = sd->parent) - destroy_sched_domain(sd); + if (sd) + call_rcu(&sd->rcu, destroy_sched_domains_rcu); } /* -- cgit v1.2.3 From 24fc7edb92eea05946119cc0258c891c26b3b469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:37:59 +0200 Subject: sched/core: Introduce 'struct sched_domain_shared' Since struct sched_domain is strictly per cpu; introduce a structure that is shared between all 'identical' sched_domains. Limit to SD_SHARE_PKG_RESOURCES domains for now, as we'll only use it for shared cache state; if another use comes up later we can easily relax this. While the sched_group's are normally shared between CPUs, these are not natural to use when we need some shared state on a domain level -- since that would require the domain to have a parent, which is not a given. 
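A sketch of the structure introduced on the include/linux/sched.h side; this patch adds only the reference count, while the follow-up patches in this series grow it with the fields shown here for context:

    struct sched_domain_shared {
            atomic_t        ref;            /* this patch: lifetime         */
            atomic_t        nr_busy_cpus;   /* added with the sd_busy move  */
            int             has_idle_cores; /* added by the idle-core scan  */
    };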
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 662d08d7b1df..97d3c18d00bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5947,6 +5947,8 @@ static void destroy_sched_domain(struct sched_domain *sd) kfree(sd->groups->sgc); kfree(sd->groups); } + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + kfree(sd->shared); kfree(sd); } @@ -6385,6 +6387,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) + *per_cpu_ptr(sdd->sds, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -6429,10 +6434,12 @@ static int sched_domains_curr_level; static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, struct sched_domain *child, int cpu) { - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int sd_weight, sd_flags = 0; + struct sd_data *sdd = &tl->data; + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + int sd_id, sd_weight, sd_flags = 0; #ifdef CONFIG_NUMA /* @@ -6487,6 +6494,9 @@ sd_init(struct sched_domain_topology_level *tl, #endif }; + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sched_domain_span(sd)); + /* * Convert topological properties into behaviour. */ @@ -6529,7 +6539,16 @@ sd_init(struct sched_domain_topology_level *tl, sd->idle_idx = 1; } - sd->private = &tl->data; + /* + * For all levels sharing cache; connect a sched_domain_shared + * instance. 
+ */ + if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_inc(&sd->shared->ref); + } + + sd->private = sdd; return sd; } @@ -6839,6 +6858,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; + sdd->sds = alloc_percpu(struct sched_domain_shared *); + if (!sdd->sds) + return -ENOMEM; + sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -6849,6 +6872,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; + struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -6859,6 +6883,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(sdd->sds, j) = sds; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -6898,6 +6929,8 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } + if (sdd->sds) + kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -6905,6 +6938,8 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; + free_percpu(sdd->sds); + sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); @@ -6916,9 +6951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); -- cgit v1.2.3 From 0e369d757578b23ac50b893f920aa50fdbc45fb6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:38:01 +0200 Subject: sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared Move the nr_busy_cpus thing from its hacky sd->parent->groups->sgc location into the much more natural sched_domain_shared location. 
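Restated as a before/after sketch, mirroring the fair.c hunks below:

    /* old: the busy count lived in the parent domain's group capacity */
    sd = rcu_dereference(per_cpu(sd_busy, cpu));
    nr_busy = atomic_read(&sd->groups->sgc->nr_busy_cpus);

    /* new: it lives in the LLC domain's shared state */
    sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
    nr_busy = atomic_read(&sds->nr_busy_cpus);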
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- kernel/sched/fair.c | 22 ++++++++++++---------- kernel/sched/sched.h | 6 +----- 3 files changed, 18 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d3c18d00bf..26826eac5545 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5981,14 +5981,14 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { + struct sched_domain_shared *sds = NULL; struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -5996,13 +5996,13 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - busy_sd = sd->parent; /* sd_busy */ + sds = sd->shared; } - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -6299,7 +6299,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) return; update_group_capacity(sd, cpu); - atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -6546,6 +6545,7 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->shared = *per_cpu_ptr(sdd->sds, sd_id); atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); } sd->private = sdd; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 786ef94197e0..15902cdb27b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8008,13 +8008,13 @@ static inline void set_cpu_sd_state_busy(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgc->nr_busy_cpus); + atomic_inc(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -8025,13 +8025,13 @@ void set_cpu_sd_state_idle(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgc->nr_busy_cpus); + atomic_dec(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -8258,8 +8258,8 @@ end: static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; + struct sched_domain_shared *sds; struct sched_domain *sd; - struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; bool kick = false; @@ -8287,11 +8287,13 @@ static inline bool nohz_kick_needed(struct rq *rq) return true; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { - sgc = sd->groups->sgc; - nr_busy = atomic_read(&sgc->nr_busy_cpus); - + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + 
* XXX: write a coherent comment on why we do this. + * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { kick = true; goto unlock; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 420c05d099c3..4fc6e9876d9c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -858,8 +858,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { @@ -871,10 +871,6 @@ struct sched_group_capacity { unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - /* - * Number of busy cpus in this group. - */ - atomic_t nr_busy_cpus; unsigned long cpumask[0]; /* iteration mask */ }; -- cgit v1.2.3 From 10e2f1acd0106c05229f94c70a344ce3a2c8008b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:38:05 +0200 Subject: sched/core: Rewrite and improve select_idle_siblings() select_idle_siblings() is a known pain point for a number of workloads; it either does too much or not enough and sometimes just does plain wrong. This rewrite attempts to address a number of issues (but sadly not all). The current code does an unconditional sched_domain iteration; with the intent of finding an idle core (on SMT hardware). The problems which this patch tries to address are: - its pointless to look for idle cores if the machine is real busy; at which point you're just wasting cycles. - it's behaviour is inconsistent between SMT and !SMT hardware in that !SMT hardware ends up doing a scan for any idle CPU in the LLC domain, while SMT hardware does a scan for idle cores and if that fails, falls back to a scan for idle threads on the 'target' core. The new code replaces the sched_domain scan with 3 explicit scans: 1) search for an idle core in the LLC 2) search for an idle CPU in the LLC 3) search for an idle thread in the 'target' core where 1 and 3 are conditional on SMT support and 1 and 2 have runtime heuristics to skip the step. Step 1) is conditional on sd_llc_shared->has_idle_cores; when a cpu goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT siblings of the CPU going idle. Similarly, we clear sd_llc_shared->has_idle_cores when we fail to find an idle core. Step 2) tracks the average cost of the scan and compares this to the average idle time guestimate for the CPU doing the wakeup. There is a significant fudge factor involved to deal with the variability of the averages. Esp. hackbench was sensitive to this. Step 3) is unconditional; we assume (also per step 1) that scanning all SMT siblings in a core is 'cheap'. With this; SMT systems gain step 2, which cures a few benchmarks -- notably one from Facebook. One 'feature' of the sched_domain iteration, which we preserve in the new code, is that it would start scanning from the 'target' CPU, instead of scanning the cpumask in cpu id order. This avoids multiple CPUs in the LLC scanning for idle to gang up and find the same CPU quite as much. The down side is that tasks can end up hopping across the LLC for no apparent reason. 
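The step 2) heuristic can be read as an exponential moving average of the scan cost with weight 1/8, compared against 1/512 of the rq's average idle time; restated as a sketch of the logic in select_idle_cpu() below (time_taken names the measured scan time):

    /* skip the LLC scan when it is unlikely to pay off */
    if (this_rq()->avg_idle / 512 < this_sd->avg_scan_cost)
            return -1;

    /* after every scan, fold the measured time into the running average */
    delta = (s64)(time_taken - this_sd->avg_scan_cost) / 8;
    this_sd->avg_scan_cost += delta;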
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 + kernel/sched/fair.c | 267 +++++++++++++++++++++++++++++++++++++++-------- kernel/sched/idle_task.c | 2 +- kernel/sched/sched.h | 6 ++ 4 files changed, 231 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26826eac5545..75ecd4f29199 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7478,6 +7478,7 @@ static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { @@ -7514,6 +7515,8 @@ void __init sched_init(void) for_each_possible_cpu(i) { per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); + per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 15902cdb27b3..6b41589c41e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1582,9 +1582,16 @@ balance: * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. */ - if (!cur) + if (!cur) { + /* + * select_idle_siblings() uses an per-cpu cpumask that + * can be used from IRQ context. + */ + local_irq_disable(); env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, env->dst_cpu); + local_irq_enable(); + } assign: task_numa_assign(env, cur, imp); @@ -4616,6 +4623,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* Working cpumask for: load_balance, load_balance_newidle. */ +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); + #ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. @@ -5280,65 +5292,231 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* - * Try and locate an idle CPU in the sched_domain. + * Implement a for_each_cpu() variant that starts the scan at a given cpu + * (@start), and wraps around. + * + * This is used to scan for idle CPUs; such that not all CPUs looking for an + * idle CPU find the same CPU. The down-side is that tasks tend to cycle + * through the LLC domain. + * + * Especially tbench is found sensitive to this. 
+ */ + +static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) +{ + int next; + +again: + next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); + + if (*wrapped) { + if (next >= start) + return nr_cpumask_bits; + } else { + if (next >= nr_cpumask_bits) { + *wrapped = 1; + n = -1; + goto again; + } + } + + return next; +} + +#define for_each_cpu_wrap(cpu, mask, start, wrap) \ + for ((wrap) = 0, (cpu) = (start)-1; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ + (cpu) < nr_cpumask_bits; ) + +#ifdef CONFIG_SCHED_SMT + +static inline void set_idle_cores(int cpu, int val) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + WRITE_ONCE(sds->has_idle_cores, val); +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); + + return def; +} + +/* + * Scans the local SMT mask to see if the entire core is idle, and records this + * information in sd_llc_shared->has_idle_cores. + * + * Since SMT siblings share all cache levels, inspecting this limited remote + * state should be fairly cheap. + */ +void update_idle_core(struct rq *rq) +{ + int core = cpu_of(rq); + int cpu; + + rcu_read_lock(); + if (test_idle_cores(core, true)) + goto unlock; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (cpu == core) + continue; + + if (!idle_cpu(cpu)) + goto unlock; + } + + set_idle_cores(core, 1); +unlock: + rcu_read_unlock(); +} + +/* + * Scan the entire LLC domain for idle cores; this dynamically switches off if + * there are no idle cores left in the system; tracked through + * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. + */ +static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int core, cpu, wrap; + + if (!test_idle_cores(target, false)) + return -1; + + cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + + for_each_cpu_wrap(core, cpus, target, wrap) { + bool idle = true; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + cpumask_clear_cpu(cpu, cpus); + if (!idle_cpu(cpu)) + idle = false; + } + + if (idle) + return core; + } + + /* + * Failed to find an idle core; stop looking for one. + */ + set_idle_cores(target, 0); + + return -1; +} + +/* + * Scan the local SMT mask for idle CPUs. + */ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu; + + for_each_cpu(cpu, cpu_smt_mask(target)) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + return cpu; + } + + return -1; +} + +#else /* CONFIG_SCHED_SMT */ + +static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +#endif /* CONFIG_SCHED_SMT */ + +/* + * Scan the LLC domain for idle CPUs; this is dynamically regulated by + * comparing the average scan cost (tracked in sd->avg_scan_cost) against the + * average idle time for this rq (as found in rq->avg_idle). 
+ */ +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + u64 avg_idle = this_rq()->avg_idle; + u64 avg_cost = this_sd->avg_scan_cost; + u64 time, cost; + s64 delta; + int cpu, wrap; + + /* + * Due to large variance we need a large fuzz factor; hackbench in + * particularly is sensitive here. + */ + if ((avg_idle / 512) < avg_cost) + return -1; + + time = local_clock(); + + for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + break; + } + + time = local_clock() - time; + cost = this_sd->avg_scan_cost; + delta = (s64)(time - cost) / 8; + this_sd->avg_scan_cost += delta; + + return cpu; +} + +/* + * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - struct sched_group *sg; + int i; if (idle_cpu(target)) return target; /* - * If the prevous cpu is cache affine and idle, don't be stupid. + * If the previous cpu is cache affine and idle, don't be stupid. */ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; - /* - * Otherwise, iterate the domains and find an eligible idle cpu. - * - * A completely idle sched group at higher domains is more - * desirable than an idle group at a lower level, because lower - * domains have smaller groups and usually share hardware - * resources which causes tasks to contend on them, e.g. x86 - * hyperthread siblings in the lowest domain (SMT) can contend - * on the shared cpu pipeline. - * - * However, while we prefer idle groups at higher domains - * finding an idle cpu at the lowest domain is still better than - * returning 'target', which we've already established, isn't - * idle. - */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - int i; + if (!sd) + return target; - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; + i = select_idle_core(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; - /* Ensure the entire group is idle */ - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + i = select_idle_cpu(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_smt(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; - /* - * It doesn't matter which cpu we pick, the - * whole group is idle. - */ - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); - } -done: return target; } @@ -7397,9 +7575,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -/* Working cpumask for load_balance and load_balance_newidle. 
*/ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); - static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index dedc81ecbb2e..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -27,7 +27,7 @@ static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); - + update_idle_core(rq); schedstat_inc(rq->sched_goidle); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4fc6e9876d9c..c917dcad82ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,6 +36,12 @@ extern void cpu_load_update_active(struct rq *this_rq); static inline void cpu_load_update_active(struct rq *this_rq) { } #endif +#ifdef CONFIG_SCHED_SMT +extern void update_idle_core(struct rq *rq); +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + /* * Helpers for converting nanosecond timing to jiffy resolution */ -- cgit v1.2.3 From 1b568f0aabf280555125bc7cefc08321ff0ebaba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:38:41 +0200 Subject: sched/core: Optimize SCHED_SMT Avoid pointless SCHED_SMT code when running on !SMT hardware. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ kernel/sched/fair.c | 8 +++++++- kernel/sched/sched.h | 23 +++++++++++++++++------ 3 files changed, 43 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75ecd4f29199..94115453c1c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7412,6 +7412,22 @@ int sched_cpu_dying(unsigned int cpu) } #endif +#ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); + +static void sched_init_smt(void) +{ + /* + * We've enumerated all CPUs and will assume that if any CPU + * has SMT siblings, CPU0 will too. + */ + if (cpumask_weight(cpu_smt_mask(0)) > 1) + static_branch_enable(&sched_smt_present); +} +#else +static inline void sched_init_smt(void) { } +#endif + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7441,6 +7457,9 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + + sched_init_smt(); + sched_smp_initialized = true; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b41589c41e4..87caf2bd26f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5357,7 +5357,7 @@ static inline bool test_idle_cores(int cpu, bool def) * Since SMT siblings share all cache levels, inspecting this limited remote * state should be fairly cheap. 
*/ -void update_idle_core(struct rq *rq) +void __update_idle_core(struct rq *rq) { int core = cpu_of(rq); int cpu; @@ -5389,6 +5389,9 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int core, cpu, wrap; + if (!static_branch_likely(&sched_smt_present)) + return -1; + if (!test_idle_cores(target, false)) return -1; @@ -5422,6 +5425,9 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t { int cpu; + if (!static_branch_likely(&sched_smt_present)) + return -1; + for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c917dcad82ad..01b5189235f2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,12 +36,6 @@ extern void cpu_load_update_active(struct rq *this_rq); static inline void cpu_load_update_active(struct rq *this_rq) { } #endif -#ifdef CONFIG_SCHED_SMT -extern void update_idle_core(struct rq *rq); -#else -static inline void update_idle_core(struct rq *rq) { } -#endif - /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -730,6 +724,23 @@ static inline int cpu_of(struct rq *rq) #endif } + +#ifdef CONFIG_SCHED_SMT + +extern struct static_key_false sched_smt_present; + +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -- cgit v1.2.3 From a399d233078edbba7cf7902a6d080100cdf75636 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 12 Sep 2016 09:47:52 +0200 Subject: sched/core: Fix incorrect utilization accounting when switching to fair class When a task switches to fair scheduling class, the period between now and the last update of its utilization is accounted as running time whatever happened during this period. This incorrect accounting applies to the task and also to the task group branch. When changing the property of a running task like its list of allowed CPUs or its scheduling class, we follow the sequence: - dequeue task - put task - change the property - set task as current task - enqueue task The end of the sequence doesn't follow the normal sequence (as per __schedule()) which is: - enqueue a task - then set the task as current task. This incorrect ordering is the root cause of incorrect utilization accounting.
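In code, the old ordering looks roughly like this (an illustrative sketch, simplified from the call sites changed below; not a verbatim hunk):

	dequeue_task(rq, p, DEQUEUE_SAVE);
	put_prev_task(rq, p);
	/* ... change the property, e.g. the allowed CPUs ... */
	p->sched_class->set_curr_task(rq);	/* marked as current first */
	enqueue_task(rq, p, ENQUEUE_RESTORE);	/* only then enqueued */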
Update the sequence to follow the right one: - dequeue task - put task - change the property - enqueue task - set task as current task Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: linaro-kernel@lists.linaro.org Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1473666472-13749-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94115453c1c4..2b5e150e070b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1109,10 +1109,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + p->sched_class->set_curr_task(rq); } /* @@ -3707,10 +3707,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, queue_flag); + if (running) + p->sched_class->set_curr_task(rq); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4263,8 +4263,6 @@ change: prev_class = p->sched_class; __setscheduler(rq, p, attr, pi); - if (running) - p->sched_class->set_curr_task(rq); if (queued) { /* * We enqueue to tail when the priority of a task is @@ -4275,6 +4273,8 @@ change: enqueue_task(rq, p, queue_flags); } + if (running) + p->sched_class->set_curr_task(rq); check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ @@ -5439,10 +5439,10 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + p->sched_class->set_curr_task(rq); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -7949,10 +7949,10 @@ void sched_move_task(struct task_struct *tsk) sched_change_group(tsk, TASK_MOVE_GROUP); - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); task_rq_unlock(rq, tsk, &rf); } -- cgit v1.2.3 From a458ae2ea616420f74480f0f5ed67ca0f3b5dbf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Sep 2016 20:29:40 +0200 Subject: sched/core, ia64: Rename set_curr_task() Rename the ia64 only set_curr_task() function to free up the name. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b5e150e070b..6ec0cfe56ddd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7818,7 +7818,7 @@ struct task_struct *curr_task(int cpu) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
*/ -void set_curr_task(int cpu, struct task_struct *p) +void ia64_set_curr_task(int cpu, struct task_struct *p) { cpu_curr(cpu) = p; } -- cgit v1.2.3 From b2bf6c314e3a9e227925240d92ecd6e9b0110170 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Sep 2016 22:00:38 +0200 Subject: sched/fair: Introduce set_curr_task() helper Now that the ia64 only set_curr_task() symbol is gone, provide a helper just like put_prev_task(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- kernel/sched/sched.h | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6ec0cfe56ddd..ce69fc7eaf19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1112,7 +1112,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); if (running) - p->sched_class->set_curr_task(rq); + set_curr_task(rq, p); } /* @@ -3710,7 +3710,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (queued) enqueue_task(rq, p, queue_flag); if (running) - p->sched_class->set_curr_task(rq); + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4274,7 +4274,7 @@ change: enqueue_task(rq, p, queue_flags); } if (running) - p->sched_class->set_curr_task(rq); + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ @@ -5442,7 +5442,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); if (running) - p->sched_class->set_curr_task(rq); + set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -7952,7 +7952,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); + set_curr_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 01b5189235f2..fc6ae04ec080 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1274,6 +1274,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); } +static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +{ + curr->sched_class->set_curr_task(rq); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -- cgit v1.2.3 From 49bd21efe7fc84f9c82c8475b8ff6f8b865b1692 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Sep 2016 22:06:01 +0200 Subject: sched/core: Fix set_user_nice() Almost all scheduler functions update state with the following pattern: if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); /* update state */ if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); if (running) set_curr_task(rq, p); set_user_nice() however misses the running part, cure this. This was found by asserting we never enqueue 'current'. 
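Together with the rq locking that brackets it, the complete pattern is roughly the following sketch (simplified; the helpers and flags are the ones used throughout this series):

	struct rq_flags rf;
	struct rq *rq = task_rq_lock(p, &rf);
	bool queued = task_on_rq_queued(p);
	bool running = task_current(rq, p);

	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE);
	if (running)
		put_prev_task(rq, p);

	/* ... update state, e.g. p->static_prio ... */

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE);
	if (running)
		set_curr_task(rq, p);

	task_rq_unlock(rq, p, &rf);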
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ce69fc7eaf19..aae08cedd75e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3724,7 +3724,8 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, queued; + bool queued, running; + int old_prio, delta; struct rq_flags rf; struct rq *rq; @@ -3746,8 +3747,11 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } queued = task_on_rq_queued(p); + running = task_current(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); + if (running) + put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3764,6 +3768,8 @@ void set_user_nice(struct task_struct *p, long nice) if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_curr(rq); } + if (running) + set_curr_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } -- cgit v1.2.3 From 9148a3a10e0b74c5722174a0bbef16d821f8a48b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Sep 2016 22:34:51 +0200 Subject: sched/debug: Add SCHED_WARN_ON() Provide SCHED_WARN_ON as wrapper for WARN_ON_ONCE() to avoid CONFIG_SCHED_DEBUG wrappery. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++----- kernel/sched/sched.h | 8 +++++++- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 87caf2bd26f0..a6789485fcae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -262,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) static inline struct task_struct *task_of(struct sched_entity *se) { -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(!entity_is_task(se)); -#endif + SCHED_WARN_ON(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -2369,7 +2367,7 @@ void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; - WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* protect against double add */ /* @@ -4474,7 +4472,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - WARN_ON(task_rq(p) != rq); + SCHED_WARN_ON(task_rq(p) != rq); if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc6ae04ec080..5489d07a4643 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -15,6 +15,12 @@ #include "cpudeadline.h" #include "cpuacct.h" +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +#define SCHED_WARN_ON(x) ((void)(x)) +#endif + struct rq; struct cpuidle_state; @@ -1309,7 +1315,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - WARN_ON(!rcu_read_lock_held()); + SCHED_WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } #else -- cgit v1.2.3 From b60205c7c558330e4e2b5df498355ec959457358 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra 
Date: Tue, 20 Sep 2016 21:58:12 +0200 Subject: sched/fair: Fix min_vruntime tracking While going through enqueue/dequeue to review the movement of set_curr_task() I noticed that the (2nd) update_min_vruntime() call in dequeue_entity() is suspect. It turns out, it's actually wrong because it will consider cfs_rq->curr, which could be the entity we just normalized. This mixes different vruntime forms and leads to failure. The purpose of the second update_min_vruntime() is to move min_vruntime forward if the entity we just removed is the one that was holding it back; _except_ for the DEQUEUE_SAVE case, because then we know it's a temporary removal and it will come back. However, since we do put_prev_task() _after_ dequeue(), cfs_rq->curr will still be set (and per the above, can be transformed into a different unit), so update_min_vruntime() should also consider curr->on_rq. This also fixes another corner case where the enqueue (which also does update_curr()->update_min_vruntime()) happens on the rq->lock break in schedule(), between dequeue and put_prev_task. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: 1e876231785d ("sched: Fix ->min_vruntime calculation in dequeue_entity()") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a6789485fcae..543b2f291152 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -460,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { + struct sched_entity *curr = cfs_rq->curr; + u64 vruntime = cfs_rq->min_vruntime; - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; + if (curr) { + if (curr->on_rq) + vruntime = curr->vruntime; + else + curr = NULL; + } if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - if (!cfs_rq->curr) + if (!curr) vruntime = se->vruntime; else vruntime = min_vruntime(vruntime, se->vruntime); @@ -3478,9 +3484,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_dequeue(cfs_rq, se); /* - * Normalize the entity after updating the min_vruntime because the - * update can refer to the ->curr item and we need to reflect this - * movement in our normalized position. + * Normalize after update_curr(); which will also have moved + * min_vruntime if @se is the one holding it back. But before doing + * update_min_vruntime() again, which will discount @se's position and + * can move min_vruntime forward still more. */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; @@ -3488,8 +3495,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); + + /* + * Now advance min_vruntime if @se was the entity holding it back, + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be + * put back on, and if we advance min_vruntime, we'll be placed back + * further than we started -- ie. we'll be penalized.
+ */ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); } /* -- cgit v1.2.3 From f9094a65755df86ec931f47b781f68ea3095cb56 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Sep 2016 02:29:17 +0200 Subject: sched/irqtime: No need for preempt-safe accessors We can safely use the preempt-unsafe accessors for irqtime when we flush its counters to kcpustat as IRQs are disabled at this time. Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Cc: Eric Dumazet Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1474849761-12678-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b93c72d5f64f..f1110760ff6c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -82,7 +82,7 @@ static cputime_t irqtime_account_hi_update(cputime_t maxtime) cputime_t irq_cputime; local_irq_save(flags); - irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - + irq_cputime = nsecs_to_cputime64(__this_cpu_read(cpu_hardirq_time)) - cpustat[CPUTIME_IRQ]; irq_cputime = min(irq_cputime, maxtime); cpustat[CPUTIME_IRQ] += irq_cputime; @@ -97,7 +97,7 @@ static cputime_t irqtime_account_si_update(cputime_t maxtime) cputime_t softirq_cputime; local_irq_save(flags); - softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - + softirq_cputime = nsecs_to_cputime64(__this_cpu_read(cpu_softirq_time)) - cpustat[CPUTIME_SOFTIRQ]; softirq_cputime = min(softirq_cputime, maxtime); cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; -- cgit v1.2.3 From 2810f611f908112ea1b30bc016d25205acb3d486 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Sep 2016 02:29:18 +0200 Subject: sched/irqtime: Remove needless IRQs disablement on kcpustat update The callers of the functions performing irqtime kcpustat updates have IRQS disabled, no need to disable them again. 
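Both of these irqtime patches rely on the same invariant: the affected paths run with IRQs disabled, so the task cannot migrate and the raw per-cpu accessors are safe. A sketch of the distinction (generic percpu API):

	/* Preempt-safe: guards against migration for the duration of the access. */
	u64 hard = this_cpu_read(cpu_hardirq_time);

	/*
	 * Raw variant: the caller must already be non-preemptible,
	 * e.g. because IRQs are disabled as in the hunks here.
	 */
	u64 hard_raw = __this_cpu_read(cpu_hardirq_time);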
Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Cc: Eric Dumazet Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1474849761-12678-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f1110760ff6c..94b1a72879ec 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -78,30 +78,26 @@ EXPORT_SYMBOL_GPL(irqtime_account_irq); static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; cputime_t irq_cputime; - local_irq_save(flags); irq_cputime = nsecs_to_cputime64(__this_cpu_read(cpu_hardirq_time)) - cpustat[CPUTIME_IRQ]; irq_cputime = min(irq_cputime, maxtime); cpustat[CPUTIME_IRQ] += irq_cputime; - local_irq_restore(flags); + return irq_cputime; } static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; cputime_t softirq_cputime; - local_irq_save(flags); softirq_cputime = nsecs_to_cputime64(__this_cpu_read(cpu_softirq_time)) - cpustat[CPUTIME_SOFTIRQ]; softirq_cputime = min(softirq_cputime, maxtime); cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; - local_irq_restore(flags); + return softirq_cputime; } @@ -295,6 +291,9 @@ static inline cputime_t account_other_time(cputime_t max) { cputime_t accounted; + /* Shall be converted to a lockdep-enabled lightweight check */ + WARN_ON_ONCE(!irqs_disabled()); + accounted = steal_account_process_time(max); if (accounted < max) -- cgit v1.2.3 From 19d23dbfeb10724675152915e76e03d771f23d9d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Sep 2016 02:29:20 +0200 Subject: sched/irqtime: Consolidate accounting synchronization with u64_stats API The irqtime accounting currently implement its own ad hoc implementation of u64_stats API. Lets rather consolidate it with the appropriate library. Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Cc: Eric Dumazet Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1474849761-12678-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 31 +++++++++++++---------------- kernel/sched/sched.h | 53 ++++++++++++++------------------------------------ 2 files changed, 29 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 94b1a72879ec..1cea2f100798 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -23,10 +23,8 @@ * task when irq is in progress while we read rq->clock. That is a worthy * compromise in place of having locks on each irq in account_system_time. */ -DEFINE_PER_CPU(u64, cpu_hardirq_time); -DEFINE_PER_CPU(u64, cpu_softirq_time); +DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static DEFINE_PER_CPU(u64, irq_start_time); static int sched_clock_irqtime; void enable_sched_clock_irqtime(void) @@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } -#ifndef CONFIG_64BIT -DEFINE_PER_CPU(seqcount_t, irq_time_seq); -#endif /* CONFIG_64BIT */ - /* * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. 
*/ void irqtime_account_irq(struct task_struct *curr) { + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); s64 delta; int cpu; @@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) return; cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; - irq_time_write_begin(); + u64_stats_update_begin(&irqtime->sync); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@ -67,11 +62,11 @@ void irqtime_account_irq(struct task_struct *curr) * that do not consume any time, but still wants to run. */ if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); + irqtime->hardirq_time += delta; else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); + irqtime->softirq_time += delta; - irq_time_write_end(); + u64_stats_update_end(&irqtime->sync); } EXPORT_SYMBOL_GPL(irqtime_account_irq); @@ -79,9 +74,10 @@ static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t irq_cputime; + u64 nsecs; - irq_cputime = nsecs_to_cputime64(__this_cpu_read(cpu_hardirq_time)) - - cpustat[CPUTIME_IRQ]; + nsecs = __this_cpu_read(cpu_irqtime.hardirq_time); + irq_cputime = nsecs_to_cputime64(nsecs) - cpustat[CPUTIME_IRQ]; irq_cputime = min(irq_cputime, maxtime); cpustat[CPUTIME_IRQ] += irq_cputime; @@ -92,9 +88,10 @@ static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t softirq_cputime; + u64 nsecs; - softirq_cputime = nsecs_to_cputime64(__this_cpu_read(cpu_softirq_time)) - - cpustat[CPUTIME_SOFTIRQ]; + nsecs = __this_cpu_read(cpu_irqtime.softirq_time); + softirq_cputime = nsecs_to_cputime64(nsecs) - cpustat[CPUTIME_SOFTIRQ]; softirq_cputime = min(softirq_cputime, maxtime); cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5489d07a4643..19b99869809d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -1735,52 +1736,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 hardirq_time; + u64 softirq_time; + u64 irq_start_time; + struct u64_stats_sync sync; +}; -DECLARE_PER_CPU(u64, cpu_hardirq_time); -DECLARE_PER_CPU(u64, cpu_softirq_time); - -#ifndef CONFIG_64BIT -DECLARE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} +DECLARE_PER_CPU(struct irqtime, cpu_irqtime); static inline u64 irq_time_read(int cpu) { - u64 irq_time; - unsigned seq; + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + unsigned int seq; + u64 total; do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} + seq = __u64_stats_fetch_begin(&irqtime->sync); + total = irqtime->softirq_time + irqtime->hardirq_time; + } while 
(__u64_stats_fetch_retry(&irqtime->sync, seq)); -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); + return total; } -#endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ -- cgit v1.2.3 From 447976ef4fd09b1be88b316d1a81553f1aa7cd07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Sep 2016 02:29:21 +0200 Subject: sched/irqtime: Consolidate irqtime flushing code The code performing irqtime nsecs stats flushing to kcpustat is roughly the same for hardirq and softirq. So lets consolidate that common code. Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Cc: Eric Dumazet Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1474849761-12678-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 1cea2f100798..5ebee3164e64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -70,32 +70,28 @@ void irqtime_account_irq(struct task_struct *curr) } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static cputime_t irqtime_account_hi_update(cputime_t maxtime) +static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t irq_cputime; - u64 nsecs; - nsecs = __this_cpu_read(cpu_irqtime.hardirq_time); - irq_cputime = nsecs_to_cputime64(nsecs) - cpustat[CPUTIME_IRQ]; + irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; irq_cputime = min(irq_cputime, maxtime); - cpustat[CPUTIME_IRQ] += irq_cputime; + cpustat[idx] += irq_cputime; return irq_cputime; } -static cputime_t irqtime_account_si_update(cputime_t maxtime) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t softirq_cputime; - u64 nsecs; - - nsecs = __this_cpu_read(cpu_irqtime.softirq_time); - softirq_cputime = nsecs_to_cputime64(nsecs) - cpustat[CPUTIME_SOFTIRQ]; - softirq_cputime = min(softirq_cputime, maxtime); - cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; + return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), + CPUTIME_IRQ, maxtime); +} - return softirq_cputime; +static cputime_t irqtime_account_si_update(cputime_t maxtime) +{ + return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), + CPUTIME_SOFTIRQ, maxtime); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -- cgit v1.2.3 From d29216842a85c7970c536108e093963f02714498 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 28 Sep 2016 00:27:17 -0500 Subject: mnt: Add a per mount namespace limit on the number of mounts CAI Qian pointed out that the semantics of shared subtrees make it possible to create an exponentially increasing number of mounts in a mount namespace. mkdir /tmp/1 /tmp/2 mount --make-rshared / for i in $(seq 1 20) ; do mount --bind /tmp/1 /tmp/2 ; done Will create create 2^20 or 1048576 mounts, which is a practical problem as some people have managed to hit this by accident. As such CVE-2016-6213 was assigned. 
Ian Kent described the situation for autofs users as follows: > The number of mounts for direct mount maps is usually not very large because of > the way they are implemented, large direct mount maps can have performance > problems. There can be anywhere from a few (likely case a few hundred) to less > than 10000, plus mounts that have been triggered and not yet expired. > > Indirect mounts have one autofs mount at the root plus the number of mounts that > have been triggered and not yet expired. > > The number of autofs indirect map entries can range from a few to the common > case of several thousand and in rare cases up to between 30000 and 50000. I've > not heard of people with maps larger than 50000 entries. > > The larger the number of map entries the greater the possibility for a large > number of active mounts so it's not hard to expect cases of a 1000 or somewhat > more active mounts. So I am setting the default number of mounts allowed per mount namespace at 100,000. This is more than enough for any use case I know of, but small enough to quickly stop an exponential increase in mounts. Which should be perfect to catch misconfigurations and malfunctioning programs. For anyone who needs a higher limit this can be changed by writing to the new /proc/sys/fs/mount-max sysctl. Tested-by: CAI Qian Signed-off-by: "Eric W. Biederman" --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b43d0b27c1fe..03f18cc15697 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -1838,6 +1839,14 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; -- cgit v1.2.3 From be6a2e4c46cc122ba9113ba569fbc50fad075fff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 4 Oct 2016 09:55:57 +0200 Subject: Revert "sched/core: Do not use smp_processor_id() with preempt enabled in smpboot_thread_fn()" This reverts commit 4fa5cd5245b627db88c9ca08ae442373b02596b4. The original change widens a preempt-off section, to avoid a seemingly unsafe smp_processor_id() use. During review I overlooked two facts: - The code to calls a non-trivial function callback: ht->park(td->cpu); ... which might (and does occasionally) sleep, triggering the warning. - More importantly, as pointed out by Peter Zijlstra, using smp_processor_id() in that context is safe, if it's done from a kernel thread that is pinned to a single CPU - which is the case here. So revert to the original code that enables preemption sooner. 
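A sketch of the pattern that makes this safe (the helper below is illustrative, not from this patch): a kthread bound to a single CPU may be preempted, but it always resumes on the same CPU, so smp_processor_id() cannot go stale:

	static struct task_struct *spawn_pinned(int (*fn)(void *), void *data,
						unsigned int cpu)
	{
		struct task_struct *tsk = kthread_create(fn, data, "pinned/%u", cpu);

		if (!IS_ERR(tsk)) {
			kthread_bind(tsk, cpu);	/* fn() may use smp_processor_id() */
			wake_up_process(tsk);
		}
		return tsk;
	}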
Reported-by: kernel test robot Acked-by: Peter Zijlstra Cc: Con Kolivas Cc: Alfred Chen Link: http://lkml.kernel.org/r/20160930015102.GB20189@yexl-desktop Signed-off-by: Ingo Molnar --- kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index fc0d8270f69e..13bc43d1fb22 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); + preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } - preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue; -- cgit v1.2.3 From 58bfea9532552d422bde7afa207e1a0f08dffa7d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 4 Oct 2016 19:55:48 -0700 Subject: timekeeping: Fix __ktime_get_fast_ns() regression In commit 27727df240c7 ("Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING"), I changed the logic to open-code the timekeeping_get_ns() function, but I forgot to include the unit conversion from cycles to nanoseconds, breaking the function's output, which impacts users like perf. This results in bogus perf timestamps like: swapper 0 [000] 253.427536: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426573: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426687: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426800: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.426905: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427022: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427127: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427239: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427346: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 254.427463: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 255.426572: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) Instead of more reasonable expected timestamps like: swapper 0 [000] 39.953768: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.064839: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.175956: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.287103: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.398217: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.509324: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.620437: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.731546: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.842654: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 40.953772: 111111111 
cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) swapper 0 [000] 41.064881: 111111111 cpu-clock: ffffffff810a0de6 native_safe_halt+0x6 ([kernel.kallsyms]) Add the proper use of timekeeping_delta_to_ns() to convert the cycle delta to nanoseconds as needed. Thanks to Brendan and Alexei for finding this quickly after the v4.8 release. Unfortunately the problematic commit has landed in some -stable trees so they'll need this fix as well. Many apologies for this mistake. I'll be looking to add a perf-clock sanity test to the kselftest timers tests soon. Fixes: 27727df240c7 "timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING" Reported-by: Brendan Gregg Reported-by: Alexei Starovoitov Tested-and-reviewed-by: Mathieu Desnoyers Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: stable Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1475636148-26539-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e07fb093f819..37dec7e3db43 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -403,8 +403,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += clocksource_delta(tkr->read(tkr->clock), - tkr->cycle_last, tkr->mask); + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tkr->read(tkr->clock), + tkr->cycle_last, + tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; -- cgit v1.2.3 From a7c2242166eb4b4447b0145705e4b66f5be89798 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Sep 2016 22:52:02 -0400 Subject: relay: simplify relay_file_read() to hell with actors... 
Signed-off-by: Al Viro --- kernel/relay.c | 78 +++++++++++++++------------------------------------------- 1 file changed, 20 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..6fa398e90eb3 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1121,51 +1121,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, return end_pos; } -/* - * subbuf_read_actor - read up to one subbuf's worth of data - */ -static int subbuf_read_actor(size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc) -{ - void *from; - int ret = 0; - - from = buf->start + read_start; - ret = avail; - if (copy_to_user(desc->arg.buf, from, avail)) { - desc->error = -EFAULT; - ret = 0; - } - desc->arg.data += ret; - desc->written += ret; - desc->count -= ret; - - return ret; -} - -typedef int (*subbuf_actor_t) (size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc); - -/* - * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries - */ -static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - subbuf_actor_t subbuf_actor, - read_descriptor_t *desc) +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) { struct rchan_buf *buf = filp->private_data; size_t read_start, avail; + size_t written = 0; int ret; - if (!desc->count) + if (!count) return 0; inode_lock(file_inode(filp)); do { + void *from; + if (!relay_file_read_avail(buf, *ppos)) break; @@ -1174,32 +1146,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!avail) break; - avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc); - if (desc->error < 0) + avail = min(count, avail); + from = buf->start + read_start; + ret = avail; + if (copy_to_user(buffer, from, avail)) break; - if (ret) { - relay_file_read_consume(buf, read_start, ret); - *ppos = relay_file_read_end_pos(buf, read_start, ret); - } - } while (desc->count && ret); - inode_unlock(file_inode(filp)); + buffer += ret; + written += ret; + count -= ret; - return desc->written; -} + relay_file_read_consume(buf, read_start, ret); + *ppos = relay_file_read_end_pos(buf, read_start, ret); + } while (count); + inode_unlock(file_inode(filp)); -static ssize_t relay_file_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) -{ - read_descriptor_t desc; - desc.written = 0; - desc.count = count; - desc.arg.buf = buffer; - desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); + return written; } static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) -- cgit v1.2.3 From 26db62f179d112d345031e14926a4cda9cd40d6e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:51 -0700 Subject: oom: keep mm of the killed task available oom_reap_task has to call exit_oom_victim in order to make sure that the oom vicim will not block the oom killer for ever. This is, however, opening new problems (e.g oom_killer_disable exclusion - see commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race")). exit_oom_victim should be only called from the victim's context ideally. One way to achieve this would be to rely on per mm_struct flags. We already have MMF_OOM_REAPED to hide a task from the oom killer since "mm, oom: hide mm which is shared with kthread or global init". 
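Such per mm_struct flags are plain bits in mm->flags, manipulated with the generic bitops; as an illustrative sketch (not a hunk from this series):

	/* the oom reaper marks the mm as done: */
	set_bit(MMF_OOM_REAPED, &mm->flags);

	/* and victim selection skips any mm so marked: */
	if (test_bit(MMF_OOM_REAPED, &mm->flags))
		continue;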
The problem is that the exit path: do_exit exit_mm tsk->mm = NULL; mmput __mmput exit_oom_victim doesn't guarantee that exit_oom_victim will get called in a bounded amount of time. At least exit_aio depends on IO which might get blocked due to lack of memory and who knows what else is lurking there. This patch takes a different approach. We remember tsk->mm into the signal_struct and bind it to the signal struct life time for all oom victims. __oom_reap_task_mm as well as oom_scan_process_thread do not have to rely on find_lock_task_mm anymore and they will have a reliable reference to the mm struct. As a result all the oom specific communication inside the OOM killer can be done via tsk->signal->oom_mm. Increasing the signal_struct for something as unlikely as the oom killer is far from ideal but this approach will make the code much more reasonable and long term we even might want to move task->mm into the signal_struct anyway. In the next step we might want to make the oom killer exclusion and access to memory reserves completely independent which would be also nice. Link: http://lkml.kernel.org/r/1472119394-11342-4-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9a05bd93f8e7..48cafe787b75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,6 +359,8 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + if (sig->oom_mm) + mmdrop(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3 From 7283094ec3db318e87ec9e31cf75f136ac2a4dd3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:54 -0700 Subject: kernel, oom: fix potential pgd_lock deadlock from __mmdrop Lockdep complains that __mmdrop is not safe from the softirq context: ================================= [ INFO: inconsistent lock state ] 4.6.0-oomfortification2-00011-geeb3eadeab96-dirty #949 Tainted: G W --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. 
swapper/1/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (pgd_lock){+.?...}, at: pgd_free+0x19/0x6b {SOFTIRQ-ON-W} state was registered at: __lock_acquire+0xa06/0x196e lock_acquire+0x139/0x1e1 _raw_spin_lock+0x32/0x41 __change_page_attr_set_clr+0x2a5/0xacd change_page_attr_set_clr+0x16f/0x32c set_memory_nx+0x37/0x3a free_init_pages+0x9e/0xc7 alternative_instructions+0xa2/0xb3 check_bugs+0xe/0x2d start_kernel+0x3ce/0x3ea x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x17a/0x18d irq event stamp: 105916 hardirqs last enabled at (105916): free_hot_cold_page+0x37e/0x390 hardirqs last disabled at (105915): free_hot_cold_page+0x2c1/0x390 softirqs last enabled at (105878): _local_bh_enable+0x42/0x44 softirqs last disabled at (105879): irq_exit+0x6f/0xd1 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(pgd_lock); lock(pgd_lock); *** DEADLOCK *** 1 lock held by swapper/1/0: #0: (rcu_callback){......}, at: rcu_process_callbacks+0x390/0x800 stack backtrace: CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 4.6.0-oomfortification2-00011-geeb3eadeab96-dirty #949 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Debian-1.8.2-1 04/01/2014 Call Trace: print_usage_bug.part.25+0x259/0x268 mark_lock+0x381/0x567 __lock_acquire+0x993/0x196e lock_acquire+0x139/0x1e1 _raw_spin_lock+0x32/0x41 pgd_free+0x19/0x6b __mmdrop+0x25/0xb9 __put_task_struct+0x103/0x11e delayed_put_task_struct+0x157/0x15e rcu_process_callbacks+0x660/0x800 __do_softirq+0x1ec/0x4d5 irq_exit+0x6f/0xd1 smp_apic_timer_interrupt+0x42/0x4d apic_timer_interrupt+0x8e/0xa0 arch_cpu_idle+0xf/0x11 default_idle_call+0x32/0x34 cpu_startup_entry+0x20c/0x399 start_secondary+0xfe/0x101 More over commit a79e53d85683 ("x86/mm: Fix pgd_lock deadlock") was explicit about pgd_lock not to be called from the irq context. This means that __mmdrop called from free_signal_struct has to be postponed to a user context. We already have a similar mechanism for mmput_async so we can use it here as well. This is safe because mm_count is pinned by mm_users. This fixes bug introduced by "oom: keep mm of the killed task available" Link: http://lkml.kernel.org/r/1472119394-11342-5-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 48cafe787b75..5650e35dda43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,8 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + /* + * __mmdrop is not safe to call from softirq context on x86 due to + * pgd_dtor so postpone it to the async context + */ if (sig->oom_mm) - mmdrop(sig->oom_mm); + mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3 From 862e3073b3eed13f17bd6be6ca6052db15c0b728 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:57 -0700 Subject: mm, oom: get rid of signal_struct::oom_victims After "oom: keep mm of the killed task available" we can safely detect an oom victim by checking task->signal->oom_mm so we do not need the signal_struct counter anymore so let's get rid of it. This alone wouldn't be sufficient for nommu archs because exit_oom_victim doesn't hide the process from the oom killer anymore. We can, however, mark the mm with a MMF flag in __mmput. 
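With the mm pinned in signal_struct, detecting a victim reduces to a pointer check, along these lines (a sketch; the helper name is illustrative):

	static inline bool tsk_is_oom_victim(struct task_struct *tsk)
	{
		return tsk->signal->oom_mm != NULL;	/* set once when the task is killed */
	}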
We can reuse MMF_OOM_REAPED and rename it to a more generic MMF_OOM_SKIP. Link: http://lkml.kernel.org/r/1472119394-11342-6-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5650e35dda43..9a8ec66cd4df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -862,6 +862,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } -- cgit v1.2.3 From 7d2e7a22cf27e7569e6816ccc05dd74248048b30 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:00 -0700 Subject: oom, suspend: fix oom_killer_disable vs. pm suspend properly Commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race") has workaround an existing race between oom_killer_disable and oom_reaper by adding another round of try_to_freeze_tasks after the oom killer was disabled. This was the easiest thing to do for a late 4.7 fix. Let's fix it properly now. After "oom: keep mm of the killed task available" we no longer have to call exit_oom_victim from the oom reaper because we have stable mm available and hide the oom_reaped mm by MMF_OOM_SKIP flag. So let's remove exit_oom_victim and the race described in the above commit doesn't exist anymore if. Unfortunately this alone is not sufficient for the oom_killer_disable usecase because now we do not have any reliable way to reach exit_oom_victim (the victim might get stuck on a way to exit for an unbounded amount of time). OOM killer can cope with that by checking mm flags and move on to another victim but we cannot do the same for oom_killer_disable as we would lose the guarantee of no further interference of the victim with the rest of the system. What we can do instead is to cap the maximum time the oom_killer_disable waits for victims. The only current user of this function (pm suspend) already has a concept of timeout for back off so we can reuse the same value there. Let's drop set_freezable for the oom_reaper kthread because it is no longer needed as the reaper doesn't wake or thaw any processes. Link: http://lkml.kernel.org/r/1472119394-11342-7-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -144,23 +144,12 @@ int freeze_processes(void) /* * Now that the whole userspace is frozen we need to disbale * the OOM killer to disallow any further interference with - * killable tasks. + * killable tasks. There is no guarantee oom victims will + * ever reach a point they go away we have to wait with a timeout. */ - if (!error && !oom_killer_disable()) + if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; - /* - * There is a hard to fix race between oom_reaper kernel thread - * and oom_killer_disable. oom_reaper calls exit_oom_victim - * before the victim reaches exit_mm so try to freeze all the tasks - * again and catch such a left over task. 
- */ - if (!error) { - pr_info("Double checking all user space processes after OOM killer disable... "); - error = try_to_freeze_tasks(true); - pr_cont("\n"); - } - if (error) thaw_processes(); return error; -- cgit v1.2.3 From 38531201c12144cd7d96abfdfe7449c2b01375e8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:59:03 -0700 Subject: mm, oom: enforce exit_oom_victim on current task There are no users of exit_oom_victim on !current task anymore so enforce the API to always work on the current. Link: http://lkml.kernel.org/r/1472119394-11342-8-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1e1d913914c0..9d68c45ebbe3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(tsk); + exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) -- cgit v1.2.3 From 6fcb52a56ff60d240f06296b12827e7f20d45f63 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 7 Oct 2016 17:00:08 -0700 Subject: thp: reduce usage of huge zero page's atomic counter The global zero page is used to satisfy an anonymous read fault. If THP(Transparent HugePage) is enabled then the global huge zero page is used. The global huge zero page uses an atomic counter for reference counting and is allocated/freed dynamically according to its counter value. CPU time spent on that counter will greatly increase if there are a lot of processes doing anonymous read faults. This patch proposes a way to reduce the access to the global counter so that the CPU load can be reduced accordingly. To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE. With this flag, the process only need to touch the global counter in two cases: 1 The first time it uses the global huge zero page; 2 The time when mm_user of its mm_struct reaches zero. Note that right now, the huge zero page is eligible to be freed as soon as its last use goes away. With this patch, the page will not be eligible to be freed until the exit of the last process from which it was ever used. And with the use of mm_user, the kthread is not eligible to use huge zero page either. Since no kthread is using huge zero page today, there is no difference after applying this patch. But if that is not desired, I can change it to when mm_count reaches zero. Case used for test on Haswell EP: usemem -n 72 --readonly -j 0x200000 100G Which spawns 72 processes and each will mmap 100G anonymous space and then do read only access to that space sequentially with a step of 2MB. CPU cycles from perf report for base commit: 54.03% usemem [kernel.kallsyms] [k] get_huge_zero_page CPU cycles from perf report for this commit: 0.11% usemem [kernel.kallsyms] [k] mm_get_huge_zero_page Performance(throughput) of the workload for base commit: 1784430792 Performance(throughput) of the workload for this commit: 4726928591 164% increase. Runtime of the workload for base commit: 707592 us Runtime of the workload for this commit: 303970 us 50% drop. Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com Signed-off-by: Aaron Lu Cc: Sergey Senozhatsky Cc: "Kirill A. 
Shutemov" Cc: Dave Hansen Cc: Tim Chen Cc: Huang Ying Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Ebru Akagunduz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9a8ec66cd4df..6d42242485cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -854,6 +854,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); -- cgit v1.2.3 From 6727ad9e206cc08b80d8000a4d67f8417e53539d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 7 Oct 2016 17:02:55 -0700 Subject: nmi_backtrace: generate one-line reports for idle cpus When doing an nmi backtrace of many cores, most of which are idle, the output is a little overwhelming and very uninformative. Suppress messages for cpus that are idling when they are interrupted and just emit one line, "NMI backtrace for N skipped: idling at pc 0xNNN". We do this by grouping all the cpuidle code together into a new .cpuidle.text section, and then checking the address of the interrupted PC to see if it lies within that section. This commit suitably tags x86 and tile idle routines, and only adds in the minimal framework for other architectures. Link: http://lkml.kernel.org/r/1472487169-14923-5-git-send-email-cmetcalf@mellanox.com Signed-off-by: Chris Metcalf Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Tested-by: Daniel Thompson [arm] Tested-by: Petr Mladek Cc: Aaron Tomlin Cc: Peter Zijlstra (Intel) Cc: "Rafael J. Wysocki" Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/idle.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -16,6 +16,9 @@ #include "sched.h" +/* Linker adds these: start and end of __cpuidle functions */ +extern char __cpuidle_text_start[], __cpuidle_text_end[]; + /** * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. @@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) __setup("hlt", cpu_idle_nopoll_setup); #endif -static inline int cpu_idle_poll(void) +static noinline int __cpuidle cpu_idle_poll(void) { rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); @@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) * * To use when the cpuidle framework cannot be used. */ -void default_idle_call(void) +void __cpuidle default_idle_call(void) { if (current_clr_polling_and_test()) { local_irq_enable(); @@ -271,6 +274,12 @@ static void cpu_idle_loop(void) } } +bool cpu_in_idle(unsigned long pc) +{ + return pc >= (unsigned long)__cpuidle_text_start && + pc < (unsigned long)__cpuidle_text_end; +} + void cpu_startup_entry(enum cpuhp_state state) { /* -- cgit v1.2.3 From 81243eacfa400f5f7b89f4c2323d0de9982bb0fb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 17:03:12 -0700 Subject: cred: simpler, 1D supplementary groups Current supplementary groups code can massively overallocate memory and is implemented in a way so that access to individual gid is done via 2D array. 
If number of gids is <= 32, memory allocation is more or less tolerable (140/148 bytes). But if it is not, code allocates full page (!) regardless and, what's even more fun, doesn't reuse small 32-entry array. 2D array means dependent shifts, loads and LEAs without possibility to optimize them (gid is never known at compile time). All of the above is unnecessary. Switch to the usual trailing-zero-len-array scheme. Memory is allocated with kmalloc/vmalloc() and only as much as needed. Accesses become simpler (LEA 8(gi,idx,4) or even without displacement). Maximum number of gids is 65536 which translates to 256KB+8 bytes. I think kernel can handle such allocation. On my usual desktop system with whole 9 (nine) aux groups, struct group_info shrinks from 148 bytes to 44 bytes, yay! Nice side effects: - "gi->gid[i]" is shorter than "GROUP_AT(gi, i)", less typing, - fix little mess in net/ipv4/ping.c should have been using GROUP_AT macro but this point becomes moot, - aux group allocation is persistent and should be accounted as such. Link: http://lkml.kernel.org/r/20160817201927.GA2096@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 67 ++++++++++++++++++--------------------------------------- kernel/uid16.c | 4 ++-- 2 files changed, 23 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -7,55 +7,31 @@ #include #include #include +#include #include struct group_info *groups_alloc(int gidsetsize) { - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? 
: 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) + struct group_info *gi; + unsigned int len; + + len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; + gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); + if (!gi) + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + if (!gi) return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - kgid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; + atomic_set(&gi->usage, 1); + gi->ngroups = gidsetsize; + return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); + kvfree(group_info); } EXPORT_SYMBOL(groups_free); @@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, for (i = 0; i < count; i++) { gid_t gid; - gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); + gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } @@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; } @@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) for (base = 0; base < max; base++) { int left = base; int right = left + stride; - kgid_t tmp = GROUP_AT(group_info, right); + kgid_t tmp = group_info->gid[right]; - while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); + while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { + group_info->gid[right] = group_info->gid[left]; right = left; left -= stride; } - GROUP_AT(group_info, right) = tmp; + group_info->gid[right] = tmp; } stride /= 3; } @@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - if (gid_gt(grp, GROUP_AT(group_info, mid))) + if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; - else if (gid_lt(grp, GROUP_AT(group_info, mid))) + else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - kgid = GROUP_AT(group_info, i); + kgid = group_info->gid[i]; group = high2lowgid(from_kgid_munged(user_ns, kgid)); if (put_user(group, grouplist+i)) return -EFAULT; @@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; -- cgit v1.2.3 From 05fd007e46296afb24d15c7d589d535e5a5b9d5c Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 7 Oct 2016 17:03:15 -0700 Subject: console: 
don't prefer first registered if DT specifies stdout-path If a device tree specifies a preferred device for kernel console output via the stdout-path or linux,stdout-path chosen node properties or the stdout alias then the kernel ought to honor it & output the kernel console to that device. As it stands, this isn't the case. Whilst we parse the stdout-path properties & set an of_stdout variable from of_alias_scan(), and use that from of_console_check() to determine whether to add a console device as a preferred console whilst registering it, we also prefer the first registered console if no other has been selected at the time of its registration. This means that if a console other than the one the device tree selects via stdout-path is registered first, we will switch to using it & when the stdout-path console is later registered the call to add_preferred_console() via of_console_check() is too late to do anything useful. In practice this seems to mean that we switch to the dummy console device fairly early & see no further console output: Console: colour dummy device 80x25 console [tty0] enabled bootconsole [ns16550a0] disabled Fix this by not automatically preferring the first registered console if one is specified by the device tree. This allows consoles to be registered but not enabled, and once the driver for the console selected by stdout-path calls of_console_check() the driver will be added to the list of preferred consoles before any other console has been enabled. When that console is then registered via register_console() it will be enabled as expected. Link: http://lkml.kernel.org/r/20160809151937.26118-1-paul.burton@imgtec.com Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: Paul Burton Cc: Tejun Heo Cc: Sergey Senozhatsky Cc: Jiri Slaby Cc: Daniel Vetter Cc: Ivan Delalande Cc: Thierry Reding Cc: Borislav Petkov Cc: Jan Kara Cc: Petr Mladek Cc: Joe Perches Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..8019cc0d3a73 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,6 +253,17 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); +#ifdef CONFIG_OF +static bool of_specified_console; + +void console_set_by_of(void) +{ + of_specified_console = true; +} +#else +# define of_specified_console false +#endif + /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -2647,7 +2658,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0) { + if (preferred_console < 0 && !of_specified_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || -- cgit v1.2.3 From 4bcc595ccd80decb4245096e3d1258989c50ed41 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 8 Oct 2016 20:32:40 -0700 Subject: printk: reinstate KERN_CONT for printing continuation lines Long long ago the kernel log buffer was a buffered stream of bytes, very much like stdio in user space. It supported log levels by scanning the stream and noticing the log level markers at the beginning of each line, but if you wanted to print a partial line in multiple chunks, you just did multiple printk() calls, and it just automatically worked. 
Except when it didn't, and you had very confusing output when different lines got all mixed up with each other. Then you got fragment lines mixing with each other, or with non-fragment lines, because it was traditionally impossible to tell whether a printk() call was a continuation or not. To at least help clarify the issue of continuation lines, we added a KERN_CONT marker back in 2007 to mark continuation lines: 474925277671 ("printk: add KERN_CONT annotation"). That continuation marker was initially an empty string, and didn't actually make any semantic difference. But it at least made it possible to annotate the source code, and have check-patch notice that a printk() didn't need or want a log level marker, because it was a continuation of a previous line. To avoid the ambiguity between a continuation line that had that KERN_CONT marker, and a printk with no level information at all, we then in 2009 made KERN_CONT be a real log level marker which meant that we could now reliably tell the difference between the two cases. 5fd29d6ccbc9 ("printk: clean up handling of log-levels and newlines") and we could take advantage of that to make sure we didn't mix up continuation lines with lines that just didn't have any loglevel at all. Then, in 2012, the kernel log buffer was changed to be a "record" based log, where each line was a record that has a loglevel and a timestamp. You can see the beginning of that conversion in commits e11fea92e13f ("kmsg: export printk records to the /dev/kmsg interface") 7ff9554bb578 ("printk: convert byte-buffer to variable-length record buffer") with a number of follow-up commits to fix some painful fallout from that conversion. Over all, it took a couple of months to sort out most of it. But the upside was that you could have concurrent readers (and writers) of the kernel log and not have lines with mixed output in them. And one particular pain-point for the record-based kernel logging was exactly the fragmentary lines that are generated in smaller chunks. In order to still log them as one record, the continuation lines need to be attached to the previous record properly. However the explicit continuation record marker that is actually useful for this exact case was actually removed at around the same time by commit 61e99ab8e35a ("printk: remove the now unnecessary "C" annotation for KERN_CONT") due to the incorrect belief that KERN_CONT wasn't meaningful. The ambiguity between "is this a continuation line" or "is this a plain printk with no log level information" was reintroduced, and in fact became an even bigger pain point because there was now the whole record-level merging of kernel messages going on. This patch reinstates the KERN_CONT as a real non-empty string marker, so that the ambiguity is fixed once again. But it's not a plain revert of that original removal: in the four years since we made KERN_CONT an empty string again, not only has the format of the log level markers changed, we've also had some usage changes in this area. For example, some ACPI code seems to use KERN_CONT _together_ with a log level, and now uses both the KERN_CONT marker and (for example) a KERN_INFO marker to show that it's an informational continuation of a line. Which is actually not a bad idea - if the continuation line cannot be attached to its predecessor, without the log level information we don't know what log level to assign to it (and we traditionally just assigned it the default loglevel).
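To make the call-site usage concrete, here is a minimal hand-written sketch (assuming only <linux/printk.h>; the order of the two markers in the combined form is illustrative, since the parsing loop below accepts the markers in any order):

  static void example(int nr)
  {
          printk(KERN_INFO "scanning buses");       /* opens a record */
          printk(KERN_CONT " ... %d found\n", nr);  /* continues it */

          /*
           * ACPI-style variant: a continuation that also carries its own
           * level, so a reader that cannot attach it to the predecessor
           * still knows which loglevel to assign:
           */
          printk(KERN_CONT KERN_INFO "continuation with a level\n");
  }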
So having both a log level and the KERN_CONT marker is not necessarily a bad idea, but it does mean that we need to actually iterate over potentially multiple markers, rather than just a single one. Also, since KERN_CONT was still conceptually needed, and encouraged, but didn't actually _do_ anything, we've also had the reverse problem: rather than having too many annotations it has too few, and there is bit rot with code that no longer marks the continuation lines with the KERN_CONT marker. So this patch not only re-instates the non-empty KERN_CONT marker, it also fixes up the cases of bit-rot I noticed in my own logs. There are probably other cases where KERN_CONT will need to be added, either because it is new code that never dealt with the need for KERN_CONT, or old code that has bitrotted without anybody noticing. That said, we should strive to avoid the need for KERN_CONT. It does result in real problems for logging, and should generally not be seen as a good feature. If we some day can get rid of the feature entirely, because nobody does any fragmented printk calls, that would be lovely. But until that point, let's at least mark the code that relies on the hacky multi-fragment kernel printk's. Not only does it avoid the ambiguity, it also annotates code as "maybe this would be good to fix some day". (That said, particularly during single-threaded bootup, the downsides of KERN_CONT are very limited. Things get much hairier when you have multiple threads going on and user level reading and writing logs too). Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..7b449802089a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -655,11 +655,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, * better readable output. 'c' in the record flags mark the first * fragment of a line, '+' the following. */ - if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) - cont = 'c'; - else if ((msg->flags & LOG_CONT) || - ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) - cont = '+'; + if (msg->flags & LOG_CONT) + cont = (prev_flags & LOG_CONT) ? '+' : 'c'; return scnprintf(buf, size, "%u,%llu,%llu,%c;", (msg->facility << 3) | msg->level, seq, ts_usec, cont); @@ -1819,10 +1816,9 @@ asmlinkage int vprintk_emit(int facility, int level, /* strip kernel syslog prefix and extract log level or control flags */ if (facility == 0) { - int kern_level = printk_get_level(text); + int kern_level; - if (kern_level) { - const char *end_of_header = printk_skip_level(text); + while ((kern_level = printk_get_level(text)) != 0) { switch (kern_level) { case '0' ... '7': if (level == LOGLEVEL_DEFAULT) @@ -1830,14 +1826,13 @@ asmlinkage int vprintk_emit(int facility, int level, /* fallthrough */ case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; + break; + case 'c': /* KERN_CONT */ + lflags |= LOG_CONT; } - /* - * No need to check length here because vscnprintf - * put '\0' at the end of the string. Only valid and - * newly printed level is detected. - */ - text_len -= end_of_header - text; - text = (char *)end_of_header; + + text_len -= 2; + text += 2; } } @@ -1852,7 +1847,7 @@ asmlinkage int vprintk_emit(int facility, int level, * Flush the conflicting buffer. An earlier newline was missing, * or another task also prints continuation lines.
*/ - if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) + if (cont.len && (!(lflags & LOG_CONT) || cont.owner != current)) cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ @@ -1874,7 +1869,7 @@ asmlinkage int vprintk_emit(int facility, int level, * a newline, flush and append the newline. */ if (cont.len) { - if (cont.owner == current && !(lflags & LOG_PREFIX)) + if (cont.owner == current && (lflags & LOG_CONT)) stored = cont_add(facility, level, text, text_len); cont_flush(LOG_NEWLINE); -- cgit v1.2.3 From c362c7ff84634390b44cc1ae7808519596de162d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 8 Oct 2016 22:02:09 -0700 Subject: printk: split out core logging code into helper function The code that actually decides how to log the message (whether to put it directly into the record log, whether to append it to an existing buffered log, or whether to start a new buffered log) is fairly non-obvious code in the middle of the vprintk_emit() function. Splitting that code up into a helper function makes it easier to understand, but perhaps more importantly also allows for the code to just return early out of the helper function once it has made the decision about where the new log content goes. Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 78 +++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7b449802089a..090e201244d8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1730,6 +1730,44 @@ static size_t cont_print_text(char *text, size_t size) return textlen; } +static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +{ + if (!(lflags & LOG_NEWLINE)) { + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or another task also prints continuation lines. + */ + if (cont.len && (!(lflags & LOG_CONT) || cont.owner != current)) + cont_flush(LOG_NEWLINE); + + /* buffer line if possible, otherwise store it right away */ + if (cont_add(facility, level, text, text_len)) + return text_len; + + return log_store(facility, level, lflags | LOG_CONT, 0, dict, dictlen, text, text_len); + } + + /* + * If an earlier newline was missing and it was the same task, + * either merge it with the current buffer and flush, or if + * there was a race with interrupts (prefix == true) then just + * flush it out and store this line separately. + * If the preceding printk was from a different task and missed + * a newline, flush and append the newline. + */ + if (cont.len) { + bool stored = false; + + if (cont.owner == current && (lflags & LOG_CONT)) + stored = cont_add(facility, level, text, text_len); + cont_flush(LOG_NEWLINE); + if (stored) + return text_len; + } + + return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) @@ -1842,45 +1880,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!(lflags & LOG_NEWLINE)) { - /* - * Flush the conflicting buffer. An earlier newline was missing, - * or another task also prints continuation lines. 
- */ - if (cont.len && (!(lflags & LOG_CONT) || cont.owner != current)) - cont_flush(LOG_NEWLINE); - - /* buffer line if possible, otherwise store it right away */ - if (cont_add(facility, level, text, text_len)) - printed_len += text_len; - else - printed_len += log_store(facility, level, - lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); - } else { - bool stored = false; - - /* - * If an earlier newline was missing and it was the same task, - * either merge it with the current buffer and flush, or if - * there was a race with interrupts (prefix == true) then just - * flush it out and store this line separately. - * If the preceding printk was from a different task and missed - * a newline, flush and append the newline. - */ - if (cont.len) { - if (cont.owner == current && (lflags & LOG_CONT)) - stored = cont_add(facility, level, text, - text_len); - cont_flush(LOG_NEWLINE); - } - - if (stored) - printed_len += text_len; - else - printed_len += log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); - } + printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); -- cgit v1.2.3 From 5e467652ffefb84b1159d5d8fda665c48e5fd840 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Oct 2016 11:53:00 -0700 Subject: printk: re-organize log_output() to be more legible Avoid some duplicate logic now that we can return early, and update the comments for the new LOG_CONT world order. This also stops the continuation flushing from just using random record flags for the flushing action, instead taking the flags from the proper original line and updating them as we add continuations to it. Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 62 ++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 090e201244d8..f6fe60d725dd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1640,35 +1640,33 @@ static struct cont { bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(enum log_flags flags) +static void cont_flush(void) { if (cont.flushed) return; if (cont.len == 0) return; - if (cont.cons) { /* * If a fragment of this line was directly flushed to the * console; wait for the console to pick up the rest of the * line. LOG_NOCONS suppresses a duplicated output. */ - log_store(cont.facility, cont.level, flags | LOG_NOCONS, + log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flags = flags; cont.flushed = true; } else { /* * If no fragment of this line ever reached the console, * just submit it to the store and free the buffer. */ - log_store(cont.facility, cont.level, flags, 0, + log_store(cont.facility, cont.level, cont.flags, 0, NULL, 0, cont.buf, cont.len); cont.len = 0; } } -static bool cont_add(int facility, int level, const char *text, size_t len) +static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { if (cont.len && cont.flushed) return false; @@ -1679,7 +1677,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) * the line gets too long, split it up in separate records. 
*/ if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { - cont_flush(LOG_CONT); + cont_flush(); return false; } @@ -1688,7 +1686,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); - cont.flags = 0; + cont.flags = flags; cont.cons = 0; cont.flushed = false; } @@ -1696,8 +1694,15 @@ static bool cont_add(int facility, int level, const char *text, size_t len) memcpy(cont.buf + cont.len, text, len); cont.len += len; + // The original flags come from the first line, + // but later continuations can add a newline. + if (flags & LOG_NEWLINE) { + cont.flags |= LOG_NEWLINE; + cont_flush(); + } + if (cont.len > (sizeof(cont.buf) * 80) / 100) - cont_flush(LOG_CONT); + cont_flush(); return true; } @@ -1732,39 +1737,26 @@ static size_t cont_print_text(char *text, size_t size) static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { - if (!(lflags & LOG_NEWLINE)) { - /* - * Flush the conflicting buffer. An earlier newline was missing, - * or another task also prints continuation lines. - */ - if (cont.len && (!(lflags & LOG_CONT) || cont.owner != current)) - cont_flush(LOG_NEWLINE); - - /* buffer line if possible, otherwise store it right away */ - if (cont_add(facility, level, text, text_len)) - return text_len; - - return log_store(facility, level, lflags | LOG_CONT, 0, dict, dictlen, text, text_len); - } - /* - * If an earlier newline was missing and it was the same task, - * either merge it with the current buffer and flush, or if - * there was a race with interrupts (prefix == true) then just - * flush it out and store this line separately. - * If the preceding printk was from a different task and missed - * a newline, flush and append the newline. + * If an earlier line was buffered, and we're a continuation + * write from the same process, try to add it to the buffer. */ if (cont.len) { - bool stored = false; + if (cont.owner == current && (lflags & LOG_CONT)) { + if (cont_add(facility, level, lflags, text, text_len)) + return text_len; + } + /* Otherwise, make sure it's flushed */ + cont_flush(); + } - if (cont.owner == current && (lflags & LOG_CONT)) - stored = cont_add(facility, level, text, text_len); - cont_flush(LOG_NEWLINE); - if (stored) + /* If it doesn't end in a newline, try to buffer the current line */ + if (!(lflags & LOG_NEWLINE)) { + if (cont_add(facility, level, lflags, text, text_len)) return text_len; } + /* Store it in the record log */ return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); } -- cgit v1.2.3 From bfd8d3f23b51018388be0411ccbc2d56277fe294 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Oct 2016 12:16:57 -0700 Subject: printk: make reading the kernel log flush pending lines That will mean that any possible subsequent continuation will now be broken up onto a line of its own (since reading the log has finalized the beginning of the line), but if user space has activated system logging (or if there's a kernel message dump going on) that is the right thing to do. And now that we actually get the continuation flags _right_ for this all, the user space logger that is reading the kernel messages can actually see the continuation marker. Not that anybody seems to really bother with it (or care), but in theory user space can do its own message stitching.
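As an illustration, with the 'c'/'+' flags emitted by msg_print_ext_header() ("%u,%llu,%llu,%c;"), a stitchable pair of records read from /dev/kmsg would look something like this (sequence numbers and timestamps invented):

  6,1042,5140900,c;scanning buses
  6,1043,5140911,+; ... 3 found

A user space logger that cares can join the '+' records onto the preceding 'c' record; one that doesn't simply treats them as separate lines.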
Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f6fe60d725dd..170d3a8040f4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -783,6 +783,8 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return ret; } +static void cont_flush(void); + static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -798,6 +800,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -860,6 +863,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return -ESPIPE; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); switch (whence) { case SEEK_SET: /* the first record */ @@ -898,6 +902,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -1284,6 +1289,7 @@ static int syslog_print(char __user *buf, int size) size_t skip; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1343,6 +1349,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (buf) { u64 next_seq; u64 seq; @@ -1504,6 +1511,7 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -3016,6 +3024,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->active = true; raw_spin_lock_irqsave(&logbuf_lock, flags); + cont_flush(); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; @@ -3106,6 +3115,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, bool ret; raw_spin_lock_irqsave(&logbuf_lock, flags); + cont_flush(); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -3148,6 +3158,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); + cont_flush(); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; -- cgit v1.2.3 From 38addce8b600ca335dc86fa3d48c890f1c6fa1f4 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:41:19 +0200 Subject: gcc-plugins: Add latent_entropy plugin This adds a new gcc plugin named "latent_entropy". It is designed to extract as much uncertainty as possible from a running system at boot time, hoping to capitalize on any possible variation in CPU operation (due to runtime data differences, hardware differences, SMP ordering, thermal timing variation, cache behavior, etc). At the very least, this plugin is a much more comprehensive example for how to manipulate kernel code using the gcc plugin internals.
The need for very-early boot entropy tends to be very architecture or system design specific, so this plugin is more suited for those sorts of special cases. The existing kernel RNG already attempts to extract entropy from reliable runtime variation, but this plugin takes the idea to a logical extreme by permuting a global variable based on any variation in code execution (e.g. a different value (and permutation function) is used to permute the global based on loop count, case statement, if/then/else branching, etc). To do this, the plugin starts by inserting a local variable in every marked function. The plugin then adds logic so that the value of this variable is modified by randomly chosen operations (add, xor and rol) and random values (gcc generates separate static values for each location at compile time and also injects the stack pointer at runtime). The resulting value depends on the control flow path (e.g., loops and branches taken). Before the function returns, the plugin mixes this local variable into the latent_entropy global variable. The value of this global variable is added to the kernel entropy pool in do_one_initcall() and _do_fork(), though it does not credit any bytes of entropy to the pool; the contents of the global are just used to mix the pool. Additionally, the plugin can pre-initialize arrays with build-time random contents, so that two different kernel builds running on identical hardware will not have the same starting values. Signed-off-by: Emese Revfy [kees: expanded commit message and code comments] Signed-off-by: Kees Cook --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..001b18473a07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1780,6 +1780,7 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + add_latent_entropy(); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. -- cgit v1.2.3 From 0766f788eb727e2e330d55d30545db65bcf2623f Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:42:34 +0200 Subject: latent_entropy: Mark functions with __latent_entropy The __latent_entropy gcc attribute can be used only on functions and variables. If it is on a function then the plugin will instrument it for gathering control-flow entropy. If the attribute is on a variable then the plugin will initialize it with random contents. The variable must be an integer, an integer array type or a structure with integer fields. These specific functions have been selected because they are init functions (to help gather boot-time entropy), are called at unpredictable times, or they have variable loops, each of which provides some level of latent entropy.
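Conceptually, a marked function behaves as if it had been written like the following hand-made C sketch (the real transformation happens on GIMPLE inside gcc; the constants, the choice of add/xor/rol and the declaration of the global are invented here for illustration, assuming <linux/types.h> and <linux/bitops.h>):

  extern volatile u64 latent_entropy;     /* the global the plugin maintains */

  static __latent_entropy void example(unsigned int n)
  {
          u64 local = 0x8f3ab27c5d19e04bULL;      /* build-time random seed */
          unsigned int i;

          for (i = 0; i < n; i++)                 /* loop count perturbs it */
                  local = rol64(local, 13) + 0x2c9a7e51ULL;

          if (n & 1)                              /* so does branching */
                  local ^= 0x44d0e1ab729fULL;

          /* ... the function's real work ... */

          latent_entropy ^= local;                /* mixed back on return */
  }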
Signed-off-by: Emese Revfy [kees: expanded commit message] Signed-off-by: Kees Cook --- kernel/fork.c | 6 ++++-- kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 2 +- kernel/sched/fair.c | 2 +- kernel/softirq.c | 4 ++-- kernel/time/timer.c | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 001b18473a07..05393881ef39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,7 +404,8 @@ free_tsk: } #ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; @@ -1296,7 +1297,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static struct task_struct *copy_process(unsigned long clone_flags, +static __latent_entropy struct task_struct *copy_process( + unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) false)); } -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..e5164deb51e1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..004996df2f10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8283,7 +8283,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? 
diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..34033fd09c8c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -482,7 +482,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -518,7 +518,7 @@ static void tasklet_action(struct softirq_action *a) } } -static void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..2d47980a1bc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1633,7 +1633,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. */ -static void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); -- cgit v1.2.3 From 9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Oct 2016 08:04:03 +0800 Subject: sched/fair: Fix sched domains NULL dereference in select_idle_sibling() Commit: 10e2f1acd01 ("sched/core: Rewrite and improve select_idle_siblings()") ... improved select_idle_sibling(), but also triggered a regression (crash) during CPU-hotplug: BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 IP: [] select_idle_sibling+0x1c2/0x4f0 Call Trace: select_task_rq_fair+0x749/0x930 ? select_task_rq_fair+0xb4/0x930 ? __lock_is_held+0x54/0x70 try_to_wake_up+0x19a/0x5b0 default_wake_function+0x12/0x20 autoremove_wake_function+0x12/0x40 __wake_up_common+0x55/0x90 __wake_up+0x39/0x50 wake_up_klogd_work_func+0x40/0x60 irq_work_run_list+0x57/0x80 irq_work_run+0x2c/0x30 smp_irq_work_interrupt+0x2e/0x40 irq_work_interrupt+0x96/0xa0 ? _raw_spin_unlock_irqrestore+0x45/0x80 try_to_wake_up+0x4a/0x5b0 wake_up_state+0x10/0x20 __kthread_unpark+0x67/0x70 kthread_unpark+0x22/0x30 cpuhp_online_idle+0x3e/0x70 cpu_startup_entry+0x6a/0x450 start_secondary+0x154/0x180 This can be reproduced by running the ftrace test case of kselftest; the test case will hot-unplug the CPU and the CPU will attach to the NULL sched-domain during scheduler teardown. Step 2 of the select_idle_siblings() rewrite: | Step 2) tracks the average cost of the scan and compares this to the | average idle time guestimate for the CPU doing the wakeup. If the CPU doing the wakeup is the CPU being hot-unplugged, then the NULL sched domain will be dereferenced to acquire the average cost of the scan. This patch fixes it by failing the search for an idle CPU in the LLC if this sched domain is NULL.
Tested-by: Catalin Marinas Signed-off-by: Wanpeng Li Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1475971443-3187-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 502e95a6e927..8b03fb5d1b9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5471,13 +5471,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { - struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - u64 avg_idle = this_rq()->avg_idle; - u64 avg_cost = this_sd->avg_scan_cost; + struct sched_domain *this_sd; + u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; int cpu, wrap; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + avg_cost = this_sd->avg_scan_cost; + /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. -- cgit v1.2.3 From bfd45be0b83e8f711f3abc892850d6047972d127 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Oct 2016 13:52:22 -0700 Subject: kprobes: include <asm/sections.h> instead of <asm-generic/sections.h> asm-generic headers are generic implementations for architecture specific code and should not be included by common code. Thus use the asm/ version of sections.h to get at the linker sections. Link: http://lkml.kernel.org/r/1473602302-6208-1-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b9b5e0..d63095472ea9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -49,7 +49,7 @@ #include #include -#include <asm-generic/sections.h> +#include <asm/sections.h> #include #include #include -- cgit v1.2.3 From 0a5bf409d3eefc1ca64cedf0bc1c0673164cacc1 Mon Sep 17 00:00:00 2001 From: Ales Novak Date: Tue, 11 Oct 2016 13:53:46 -0700 Subject: ptrace: clear TIF_SYSCALL_TRACE on ptrace detach On __ptrace_detach(), called from do_exit()->exit_notify()-> forget_original_parent()->exit_ptrace(), the TIF_SYSCALL_TRACE in thread->flags of the tracee is not cleared. This results in the tracehook_report_syscall_* hooks being called (though there's no longer a tracer listening to that) upon its further syscalls. Example scenario - attach "strace" to a running process and kill it (the strace) with SIGKILL. You'll see that the syscall trace hooks are still being called. The clearing of this flag should be moved from ptrace_detach() to __ptrace_detach().
Link: http://lkml.kernel.org/r/1472759493-20554-1-git-send-email-alnovak@suse.cz Signed-off-by: Ales Novak Acked-by: Oleg Nesterov Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1d3b7665d0be..2a99027312a6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,6 +73,8 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + child->parent = child->real_parent; list_del_init(&child->ptrace_entry); @@ -489,7 +491,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); /* -- cgit v1.2.3 From 0ee59413c967c35a6dd2dbdab605b4cd42025ee5 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 11 Oct 2016 13:54:23 -0700 Subject: x86/panic: replace smp_send_stop() with kdump friendly version in panic path Daniel Walker reported problems which happen when the crash_kexec_post_notifiers kernel option is enabled (https://lkml.org/lkml/2015/6/24/44). In that case, smp_send_stop() is called before entering kdump routines which assume other CPUs are still online. As a result, for x86, kdump routines fail to save other CPUs' registers and disable virtualization extensions. To fix this problem, call a new kdump friendly function, crash_smp_send_stop(), instead of smp_send_stop() when crash_kexec_post_notifiers is enabled. crash_smp_send_stop() is a weak function, and it just calls smp_send_stop(). Architecture code should override it so that kdump can work appropriately. This patch only provides the x86-specific version. For Xen's PV kernel, just keep the current behavior. NOTES: - The right solution would be to place crash_smp_send_stop() before the __crash_kexec() invocation in all cases and remove smp_send_stop(), but we can't do that until all architectures implement their own crash_smp_send_stop() - crash_smp_send_stop()-like work is still needed by machine_crash_shutdown() because crash_kexec() can be called without entering panic() Fixes: f06e5153f4ae (kernel/panic.c: add "crash_kexec_post_notifiers" option) Link: http://lkml.kernel.org/r/20160810080948.11028.15344.stgit@sysi4-13.yrl.intra.hitachi.co.jp Signed-off-by: Hidehiro Kawai Reported-by: Daniel Walker Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Eric Biederman Cc: Masami Hiramatsu Cc: Daniel Walker Cc: Xunlei Pang Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: David Vrabel Cc: Toshi Kani Cc: Ralf Baechle Cc: David Daney Cc: Aaro Koskinen Cc: "Steven J. Hill" Cc: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index ca8cea1ef673..e6480e20379e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) panic_smp_self_stop(); } +/* + * Stop other CPUs in panic. Architecture dependent code may override this + * with more suitable version. For example, if the architecture supports + * crash dump, it should save registers of each stopped CPU and disable + * per-CPU features such as virtualization extensions.
+ */ +void __weak crash_smp_send_stop(void) +{ + static int cpus_stopped; + + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + */ + if (cpus_stopped) + return; + + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a panic + * situation. + */ + smp_send_stop(); + cpus_stopped = 1; +} + atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); /* @@ -164,14 +190,21 @@ void panic(const char *fmt, ...) if (!_crash_kexec_post_notifiers) { printk_nmi_flush_on_panic(); __crash_kexec(NULL); - } - /* - * Note smp_send_stop is the usual smp shutdown function, which - * unfortunately means it may not be hardened to work in a panic - * situation. - */ - smp_send_stop(); + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a + * panic situation. + */ + smp_send_stop(); + } else { + /* + * If we want to do crash dump after notifier calls and + * kmsg_dump, we will need architecture dependent extra + * works in addition to stopping other CPUs. + */ + crash_smp_send_stop(); + } /* * Run any panic handlers, including those that might need to -- cgit v1.2.3 From 26b5679e437ef4f83db66437981c7c0d569973b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Oct 2016 13:54:33 -0700 Subject: relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case the Scheduler itself is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a millisecond. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that the wakeup of the userspace client, blocked in the poll call, is done at the earliest (through a self IPI or the next timer tick), enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers.
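Reduced to its essentials, the pattern looks like this minimal sketch (not relay's actual buffer handling; names invented):

  #include <linux/irq_work.h>
  #include <linux/wait.h>

  static DECLARE_WAIT_QUEUE_HEAD(read_wait);
  static struct irq_work wakeup_work;

  /* Runs in hard-irq context (raised by a self IPI or the next tick),
   * where waking the queue cannot deadlock against rq->lock. */
  static void wakeup_readers_fn(struct irq_work *work)
  {
          wake_up_interruptible(&read_wait);
  }

  static void producer_init(void)
  {
          init_irq_work(&wakeup_work, wakeup_readers_fn);
  }

  /* May be called from any context, even with rq->lock held. */
  static void data_produced(void)
  {
          irq_work_queue(&wakeup_work);
  }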
[arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra Signed-off-by: Akash Goel Cc: Tom Zanussi Cc: Chris Wilson Cc: Tvrtko Ursulin Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 9988f5cc2d46..da79a109dbeb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * @data: contains the channel buffer + * @work: contains the channel buffer * - * This is the timer function used to defer reader waking. + * This is the function used to defer reader waking */ -static void wakeup_readers(unsigned long data) +static void wakeup_readers(struct irq_work *work) { - struct rchan_buf *buf = (struct rchan_buf *)data; + struct rchan_buf *buf; + + buf = container_of(work, struct rchan_buf, wakeup_work); wake_up_interruptible(&buf->read_wait); } @@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); - } else - del_timer_sync(&buf->timer); + init_irq_work(&buf->wakeup_work, wakeup_readers); + } else { + irq_work_sync(&buf->wakeup_work); + } buf->subbufs_produced = 0; buf->subbufs_consumed = 0; @@ -487,7 +490,7 @@ free_buf: static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; - del_timer_sync(&buf->timer); + irq_work_sync(&buf->wakeup_work); buf->chan->cb->remove_buf_file(buf->dentry); kref_put(&buf->kref, relay_remove_buf); } @@ -754,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; smp_mb(); - if (waitqueue_active(&buf->read_wait)) + if (waitqueue_active(&buf->read_wait)) { /* * Calling wake_up_interruptible() from here * will deadlock if we happen to be logging * from the scheduler (trying to re-grab * rq->lock), so defer it. 
*/ - mod_timer(&buf->timer, jiffies + 1); + irq_work_queue(&buf->wakeup_work); + } } old = buf->data; -- cgit v1.2.3 From a2c6a235dbf4318fc7f7981932478e6c47f093ab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 11 Oct 2016 13:54:36 -0700 Subject: config/android: Remove CONFIG_IPV6_PRIVACY Option is long gone, see commit 5d9efa7ee99e ("ipv6: Remove privacy config option.") Link: http://lkml.kernel.org/r/20160811170340.9859-1-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/android-base.config | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 9f748ed7bea8..fcf2de834ee9 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -41,7 +41,6 @@ CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IP_ADVANCED_ROUTER=y -- cgit v1.2.3 From f023a3956f273859ed36f624f75a66c272124b16 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:38 -0700 Subject: config: android: move device mapper options to recommended CONFIG_MD is in recommended, but other dependent options like DM_CRYPT and DM_VERITY options are in base. The result is the options in base don't get enabled when applying both base and recommended fragments. Move all the options to recommended. Link: http://lkml.kernel.org/r/20160908185934.18098-1-robh@kernel.org Signed-off-by: Rob Herring Acked-by: John Stultz Cc: Amit Pundir Cc: Dmitry Shmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/android-base.config | 4 ---- kernel/configs/android-recommended.config | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index fcf2de834ee9..da8b3e9deac6 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y -CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y @@ -19,9 +18,6 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_DM_CRYPT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index e3b953e966d2..297756be369c 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -6,12 +6,16 @@ # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y +CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_DRAGONRISE_FF=y CONFIG_ENABLE_DEFAULT_TRACERS=y CONFIG_EXT4_FS=y -- cgit v1.2.3 From d90ae51a3e7556c9f50431db43cd8190934ccf94 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:41 -0700 Subject: config: android: set SELinux as default security mode Android won't boot without SELinux enabled, so make it the default. 
Link: http://lkml.kernel.org/r/20160908185934.18098-2-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/android-base.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index da8b3e9deac6..248070e0d61d 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -18,6 +18,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y -- cgit v1.2.3 From 2489a1771aae86062c637f0eb7a06129681208b1 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:44 -0700 Subject: config: android: enable CONFIG_SECCOMP As of Android N, SECCOMP is required. Without it, we will get mediaextractor error: E /system/bin/mediaextractor: libminijail: prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER): Invalid argument Link: http://lkml.kernel.org/r/20160908185934.18098-3-robh@kernel.org Signed-off-by: Rob Herring Acked-by: John Stultz Cc: Amit Pundir Cc: Dmitry Shmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/android-base.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 248070e0d61d..1a8f34f63601 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -131,6 +131,7 @@ CONFIG_PREEMPT=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y -- cgit v1.2.3 From e700591ae03896c16974d4e1ab58eb296aaa5f59 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:17 -0700 Subject: kthread: rename probe_kthread_data() to kthread_probe_data() Patch series "kthread: Kthread worker API improvements" The intention of this patchset is to make it easier to manipulate and maintain kthreads. Especially, I want to replace all the custom main cycles with a generic one. Also I want to make the kthreads sleep in a consistent state in a common place when there is no work. This patch (of 11): A good practice is to prefix the names of functions by the name of the subsystem. This patch fixes the name of probe_kthread_data(). The other wrong function names are part of the kthread worker API and will be fixed separately. Link: http://lkml.kernel.org/r/1470754545-17632-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Suggested-by: Andrew Morton Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 4 ++-- kernel/workqueue.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ab4c3766a80..7e77b728f96b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -138,7 +138,7 @@ void *kthread_data(struct task_struct *task) } /** - * probe_kthread_data - speculative version of kthread_data() + * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task.
Return the data value specified when it @@ -146,7 +146,7 @@ void *kthread_data(struct task_struct *task) * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ -void *probe_kthread_data(struct task_struct *task) +void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = to_kthread(task); void *data = NULL; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bd81f0390277..479d840db286 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4261,7 +4261,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ - worker = probe_kthread_data(task); + worker = kthread_probe_data(task); /* * Carefully copy the associated workqueue's workfn and name. Keep -- cgit v1.2.3 From 3989144f863ac576e6efba298d24b0b02a10d4bb Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:20 -0700 Subject: kthread: kthread worker API cleanup A good practice is to prefix the names of functions by the name of the subsystem. The kthread worker API is a mix of classic kthreads and workqueues. Each worker has a dedicated kthread. It runs a generic function that processes queued works. It is implemented as part of the kthread subsystem. This patch renames the existing kthread worker API to use the corresponding name from the workqueues API prefixed by kthread_: __init_kthread_worker() -> __kthread_init_worker() init_kthread_worker() -> kthread_init_worker() init_kthread_work() -> kthread_init_work() insert_kthread_work() -> kthread_insert_work() queue_kthread_work() -> kthread_queue_work() flush_kthread_work() -> kthread_flush_work() flush_kthread_worker() -> kthread_flush_worker() Note that the names of DEFINE_KTHREAD_WORK*() macros stay as they are. It is common that the "DEFINE_" prefix has precedence over the subsystem names. Note that INIT() macros and init() functions use a different naming scheme. There is no good solution. There are several reasons for this solution: + "init" in the function names stands for the verb "initialize" aka "initialize worker". While "INIT" in the macro names stands for the noun "INITIALIZER" aka "worker initializer". + INIT() macros are used only in DEFINE() macros + init() functions are used close to the other kthread() functions. It looks much better if all the functions use the same scheme. + There will also be kthread_destroy_worker() that will be used close to kthread_cancel_work(). It is related to the init() function. Again it looks better if all functions use the same naming scheme. + there are several precedents for such init() function names, e.g. amd_iommu_init_device(), free_area_init_node(), jump_label_init_type(), regmap_init_mmio_clk(), + It is not an argument but it was inconsistent even before. [arnd@arndb.de: fix linux-next merge conflict] Link: http://lkml.kernel.org/r/20160908135724.1311726-1-arnd@arndb.de Link: http://lkml.kernel.org/r/1470754545-17632-3-git-send-email-pmladek@suse.com Suggested-by: Andrew Morton Signed-off-by: Petr Mladek Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E.
McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 7e77b728f96b..c52a05a8ec52 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -540,7 +540,7 @@ int kthreadd(void *unused) return 0; } -void __init_kthread_worker(struct kthread_worker *worker, +void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { @@ -549,7 +549,7 @@ void __init_kthread_worker(struct kthread_worker *worker, INIT_LIST_HEAD(&worker->work_list); worker->task = NULL; } -EXPORT_SYMBOL_GPL(__init_kthread_worker); +EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker @@ -606,7 +606,7 @@ repeat: EXPORT_SYMBOL_GPL(kthread_worker_fn); /* insert @work before @pos in @worker */ -static void insert_kthread_work(struct kthread_worker *worker, +static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { @@ -619,7 +619,7 @@ static void insert_kthread_work(struct kthread_worker *worker, } /** - * queue_kthread_work - queue a kthread_work + * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * @@ -627,7 +627,7 @@ static void insert_kthread_work(struct kthread_worker *worker, * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. */ -bool queue_kthread_work(struct kthread_worker *worker, +bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; @@ -635,13 +635,13 @@ bool queue_kthread_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); if (list_empty(&work->node)) { - insert_kthread_work(worker, work, &worker->work_list); + kthread_insert_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); return ret; } -EXPORT_SYMBOL_GPL(queue_kthread_work); +EXPORT_SYMBOL_GPL(kthread_queue_work); struct kthread_flush_work { struct kthread_work work; @@ -656,12 +656,12 @@ static void kthread_flush_work_fn(struct kthread_work *work) } /** - * flush_kthread_work - flush a kthread_work + * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. 
*/ -void flush_kthread_work(struct kthread_work *work) +void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), @@ -682,9 +682,10 @@ retry: } if (!list_empty(&work->node)) - insert_kthread_work(worker, &fwork.work, work->node.next); + kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) - insert_kthread_work(worker, &fwork.work, worker->work_list.next); + kthread_insert_work(worker, &fwork.work, + worker->work_list.next); else noop = true; @@ -693,23 +694,23 @@ retry: if (!noop) wait_for_completion(&fwork.done); } -EXPORT_SYMBOL_GPL(flush_kthread_work); +EXPORT_SYMBOL_GPL(kthread_flush_work); /** - * flush_kthread_worker - flush all current works on a kthread_worker + * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ -void flush_kthread_worker(struct kthread_worker *worker) +void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; - queue_kthread_work(worker, &fwork.work); + kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } -EXPORT_SYMBOL_GPL(flush_kthread_worker); +EXPORT_SYMBOL_GPL(kthread_flush_worker); -- cgit v1.2.3 From a65d40961dc70d15dec046c2ee88c556f57940b2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:23 -0700 Subject: kthread/smpboot: do not park in kthread_create_on_cpu() kthread_create_on_cpu() was added by the commit 2a1d446019f9a5983e ("kthread: Implement park/unpark facility"). It is currently used only when enabling new CPU. For this purpose, the newly created kthread has to be parked. The CPU binding is a bit tricky. The kthread is parked when the CPU has not been allowed yet. And the CPU is bound when the kthread is unparked. The function would be useful for more per-CPU kthreads, e.g. bnx2fc_thread, fcoethread. For this purpose, the newly created kthread should stay in the uninterruptible state. This patch moves the parking into smpboot. It binds the thread already when created. Then the function might be used universally. Also the behavior is consistent with kthread_create() and kthread_create_on_node(). Link: http://lkml.kernel.org/r/1470754545-17632-4-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 8 ++++++-- kernel/smpboot.c | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index c52a05a8ec52..f5dc0b151d61 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -390,10 +390,10 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), cpu); if (IS_ERR(p)) return p; + kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. 
*/ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); to_kthread(p)->cpu = cpu; - /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ - kthread_park(p); return p; } @@ -407,6 +407,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) * which might be about to be cleared. */ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + /* + * Newly created kthread was parked when the CPU was offline. + * The binding was lost and we need to set it again. + */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); wake_up_state(k, TASK_PARKED); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..4a5c6e73ecd4 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kfree(td); return PTR_ERR(tsk); } + /* + * Park the thread so that it could start right on the CPU + * when it is available. + */ + kthread_park(tsk); get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; if (ht->create) { -- cgit v1.2.3 From 255451e45345bcd19e79f25e9afc0b9e14ccb104 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:27 -0700 Subject: kthread: allow to call __kthread_create_on_node() with va_list args kthread_create_on_node() implements a bunch of logic to create the kthread. It is already called by kthread_create_on_cpu(). We are going to extend the kthread worker API and will need to call kthread_create_on_node() with va_list args there. This patch does only a refactoring and does not modify the existing behavior. Link: http://lkml.kernel.org/r/1470754545-17632-5-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 72 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index f5dc0b151d61..36fd751a769f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -244,33 +244,10 @@ static void create_kthread(struct kthread_create_info *create) } } -/** - * kthread_create_on_node - create a kthread. - * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @node: task and thread structures for the thread are allocated on this node - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and - * is affine to all CPUs. - * - * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which no one will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number; it will be passed to kthread_stop(). 
- * - * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). - */ -struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, int node, - const char namefmt[], - ...) +static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + void *data, int node, + const char namefmt[], + va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; @@ -311,11 +288,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; - va_list args; - va_start(args, namefmt); vsnprintf(task->comm, sizeof(task->comm), namefmt, args); - va_end(args); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. @@ -326,6 +300,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), kfree(create); return task; } + +/** + * kthread_create_on_node - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @node: task and thread structures for the thread are allocated on this node + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. + * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn() can either call do_exit() directly if it is a + * standalone thread for which no one will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). + */ +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, int node, + const char namefmt[], + ...) +{ + struct task_struct *task; + va_list args; + + va_start(args, namefmt); + task = __kthread_create_on_node(threadfn, data, node, namefmt, args); + va_end(args); + + return task; +} EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) -- cgit v1.2.3 From fbae2d44aa1df72d0154be77eb4d71e1e34c0f8f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:30 -0700 Subject: kthread: add kthread_create_worker*() Kthread workers are currently created using the classic kthread API, namely kthread_run(). kthread_worker_fn() is passed as the @threadfn parameter. This patch defines kthread_create_worker() and kthread_create_worker_on_cpu() functions that hide implementation details. They enforce using kthread_worker_fn() for the main thread. But I doubt that there are any plans to create any alternative. In fact, I think that we do not want any alternative main thread because it would be hard to support consistency with the rest of the kthread worker API. The naming and function of kthread_create_worker() is inspired by the workqueues API like the rest of the kthread worker API. The kthread_create_worker_on_cpu() variant is motivated by the original kthread_create_on_cpu(). 
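[ editor: illustration only, not part of the patch. A minimal sketch of how a caller might use the API added here, using the signatures as introduced at this point in the series (a @flags parameter is only added by a later patch in the series); my_data, my_work_fn and "my_helper" are made-up names. ]

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/printk.h>

struct my_data {
	struct kthread_work work;
	int value;
};

static struct my_data my_item;

static void my_work_fn(struct kthread_work *work)
{
	struct my_data *d = container_of(work, struct my_data, work);

	pr_info("processing %d\n", d->value);
}

static int my_start(void)
{
	struct kthread_worker *worker;

	worker = kthread_create_worker("my_helper");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	/* my_item must stay allocated until the work has been processed */
	kthread_init_work(&my_item.work, my_work_fn);
	my_item.value = 42;
	kthread_queue_work(worker, &my_item.work);
	return 0;
}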
Note that we need to bind per-CPU kthread workers already when they are
created. It makes life easier. kthread_bind() could not be used later for an
already running worker.

This patch does _not_ convert existing kthread workers. The kthread worker
API needs more improvements first, e.g. a function to destroy the worker.

IMPORTANT:

kthread_create_worker_on_cpu() allows one to use any format of the worker
name, in contrast with kthread_create_on_cpu(). The good thing is that it
is more generic. The bad thing is that most users will need to pass the cpu
number in two parameters, e.g. kthread_create_worker_on_cpu(cpu, "helper/%d", cpu).

To be honest, the main motivation was to avoid the need for an empty
va_list. The only legal way was to create a helper function that would be
called with an empty list. Other attempts caused compilation warnings or
even errors on different architectures.

There were also other alternatives, for example, using #define or splitting
__kthread_create_worker(). The used solution looked like the least ugly.

Link: http://lkml.kernel.org/r/1470754545-17632-6-git-send-email-pmladek@suse.com
Signed-off-by: Petr Mladek
Acked-by: Tejun Heo
Cc: Oleg Nesterov
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: "Paul E. McKenney"
Cc: Josh Triplett
Cc: Thomas Gleixner
Cc: Jiri Kosina
Cc: Borislav Petkov
Cc: Michal Hocko
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kthread.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 103 insertions(+), 10 deletions(-)
(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 36fd751a769f..12695e68ed00 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -571,23 +571,24 @@ EXPORT_SYMBOL_GPL(__kthread_init_worker);
  * kthread_worker_fn - kthread function to process kthread_worker
  * @worker_ptr: pointer to initialized kthread_worker
  *
- * This function can be used as @threadfn to kthread_create() or
- * kthread_run() with @worker_ptr argument pointing to an initialized
- * kthread_worker. The started kthread will process work_list until
- * the it is stopped with kthread_stop(). A kthread can also call
- * this function directly after extra initialization.
+ * This function implements the main cycle of kthread worker. It processes
+ * work_list until it is stopped with kthread_stop(). It sleeps when the queue
+ * is empty.
  *
- * Different kthreads can be used for the same kthread_worker as long
- * as there's only one kthread attached to it at any given time. A
- * kthread_worker without an attached kthread simply collects queued
- * kthread_works.
+ * The works are not allowed to keep any locks, disable preemption or interrupts
+ * when they finish. There is defined a safe point for freezing when one work
+ * finishes and before a new one is started.
  */
 int kthread_worker_fn(void *worker_ptr)
 {
 	struct kthread_worker *worker = worker_ptr;
 	struct kthread_work *work;

-	WARN_ON(worker->task);
+	/*
+	 * FIXME: Update the check and remove the assignment when all kthread
+	 * worker users are created using kthread_create_worker*() functions.
+ */ + WARN_ON(worker->task && worker->task != current); worker->task = current; repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ @@ -621,6 +622,98 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +static struct kthread_worker * +__kthread_create_worker(int cpu, const char namefmt[], va_list args) +{ + struct kthread_worker *worker; + struct task_struct *task; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (!worker) + return ERR_PTR(-ENOMEM); + + kthread_init_worker(worker); + + if (cpu >= 0) { + char name[TASK_COMM_LEN]; + + /* + * kthread_create_worker_on_cpu() allows to pass a generic + * namefmt in compare with kthread_create_on_cpu. We need + * to format it here. + */ + vsnprintf(name, sizeof(name), namefmt, args); + task = kthread_create_on_cpu(kthread_worker_fn, worker, + cpu, name); + } else { + task = __kthread_create_on_node(kthread_worker_fn, worker, + -1, namefmt, args); + } + + if (IS_ERR(task)) + goto fail_task; + + worker->task = task; + wake_up_process(task); + return worker; + +fail_task: + kfree(worker); + return ERR_CAST(task); +} + +/** + * kthread_create_worker - create a kthread worker + * @namefmt: printf-style name for the kthread worker (task). + * + * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * when the needed structures could not get allocated, and ERR_PTR(-EINTR) + * when the worker was SIGKILLed. + */ +struct kthread_worker * +kthread_create_worker(const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker(-1, namefmt, args); + va_end(args); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker); + +/** + * kthread_create_worker_on_cpu - create a kthread worker and bind it + * it to a given CPU and the associated NUMA node. + * @cpu: CPU number + * @namefmt: printf-style name for the kthread worker (task). + * + * Use a valid CPU number if you want to bind the kthread worker + * to the given CPU and the associated NUMA node. + * + * A good practice is to add the cpu number also into the worker name. + * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). + * + * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * when the needed structures could not get allocated, and ERR_PTR(-EINTR) + * when the worker was SIGKILLed. + */ +struct kthread_worker * +kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker(cpu, namefmt, args); + va_end(args); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker_on_cpu); + /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, -- cgit v1.2.3 From 35033fe9cbbf18415dfeb7e27f0d4228dfc7458a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:33 -0700 Subject: kthread: add kthread_destroy_worker() The current kthread worker users call flush() and stop() explicitly. This function does the same plus it frees the kthread_worker struct in one call. It is supposed to be used together with kthread_create_worker*() that allocates struct kthread_worker. Link: http://lkml.kernel.org/r/1470754545-17632-7-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 12695e68ed00..f874300dfce5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -823,3 +823,26 @@ void kthread_flush_worker(struct kthread_worker *worker) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); + +/** + * kthread_destroy_worker - destroy a kthread worker + * @worker: worker to be destroyed + * + * Flush and destroy @worker. The simple flush is enough because the kthread + * worker API is used only in trivial scenarios. There are no multi-step state + * machines needed. + */ +void kthread_destroy_worker(struct kthread_worker *worker) +{ + struct task_struct *task; + + task = worker->task; + if (WARN_ON(!task)) + return; + + kthread_flush_worker(worker); + kthread_stop(task); + WARN_ON(!list_empty(&worker->work_list)); + kfree(worker); +} +EXPORT_SYMBOL(kthread_destroy_worker); -- cgit v1.2.3 From 8197b3d43b250b8214b9913878e9227eef8bd85f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:36 -0700 Subject: kthread: detect when a kthread work is used by more workers Nothing currently prevents a work from queuing for a kthread worker when it is already running on another one. This means that the work might run in parallel on more than one worker. Also some operations are not reliable, e.g. flush. This problem will be even more visible after we add kthread_cancel_work() function. It will only have "work" as the parameter and will use worker->lock to synchronize with others. Well, normally this is not a problem because the API users are sane. But bugs might happen and users also might be crazy. This patch adds a warning when we try to insert the work for another worker. It does not fully prevent the misuse because it would make the code much more complicated without a big benefit. It adds the same warning also into kthread_flush_work() instead of the repeated attempts to get the right lock. A side effect is that one needs to explicitly reinitialize the work if it must be queued into another worker. This is needed, for example, when the worker is stopped and started again. It is a bit inconvenient. But it looks like a good compromise between the stability and complexity. I have double checked all existing users of the kthread worker API and they all seems to initialize the work after the worker gets started. Just for completeness, the patch adds a check that the work is not already in a queue. The patch also puts all the checks into a separate function. It will be reused when implementing delayed works. Link: http://lkml.kernel.org/r/1470754545-17632-8-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index f874300dfce5..524dfc8247d7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -578,6 +578,9 @@ EXPORT_SYMBOL_GPL(__kthread_init_worker); * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. + * + * Also the works must not be handled by more than one worker at the same time, + * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { @@ -714,12 +717,21 @@ kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) } EXPORT_SYMBOL(kthread_create_worker_on_cpu); +static void kthread_insert_work_sanity_check(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + WARN_ON_ONCE(!list_empty(&work->node)); + /* Do not use a work with >1 worker, see kthread_queue_work() */ + WARN_ON_ONCE(work->worker && work->worker != worker); +} + /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, - struct kthread_work *work, - struct list_head *pos) + struct kthread_work *work, + struct list_head *pos) { - lockdep_assert_held(&worker->lock); + kthread_insert_work_sanity_check(worker, work); list_add_tail(&work->node, pos); work->worker = worker; @@ -735,6 +747,9 @@ static void kthread_insert_work(struct kthread_worker *worker, * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. + * + * Reinitialize the work if it needs to be used by another worker. + * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) @@ -779,16 +794,13 @@ void kthread_flush_work(struct kthread_work *work) struct kthread_worker *worker; bool noop = false; -retry: worker = work->worker; if (!worker) return; spin_lock_irq(&worker->lock); - if (work->worker != worker) { - spin_unlock_irq(&worker->lock); - goto retry; - } + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); -- cgit v1.2.3 From 22597dc3d97b1ead2aca201397415a1a84bf2b26 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:40 -0700 Subject: kthread: initial support for delayed kthread work We are going to use kthread_worker more widely and delayed works will be pretty useful. The implementation is inspired by workqueues. It uses a timer to queue the work after the requested delay. If the delay is zero, the work is queued immediately. In compare with workqueues, each work is associated with a single worker (kthread). Therefore the implementation could be much easier. In particular, we use the worker->lock to synchronize all the operations with the work. We do not need any atomic operation with a flags variable. In fact, we do not need any state variable at all. Instead, we add a list of delayed works into the worker. 
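[ editor: illustration only, not part of the patch. A hypothetical caller of the delayed-work API added here might look as follows; it assumes the kthread_init_delayed_work() initializer from the matching include/linux/kthread.h change, which this kernel/-only diff does not show, and reuses the made-up my_work_fn from the earlier sketch. ]

#include <linux/jiffies.h>
#include <linux/kthread.h>

static struct kthread_delayed_work my_dwork;

static bool my_queue_later(struct kthread_worker *worker)
{
	kthread_init_delayed_work(&my_dwork, my_work_fn);

	/*
	 * Runs my_work_fn() on the worker after roughly 100ms;
	 * a zero delay would queue the work immediately.
	 */
	return kthread_queue_delayed_work(worker, &my_dwork,
					  msecs_to_jiffies(100));
}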
Then the pending work is listed either in the list of queued or delayed works. And the existing check of pending works is the same even for the delayed ones. A work must not be assigned to another worker unless reinitialized. Therefore the timer handler might expect that dwork->work->worker is valid and it could simply take the lock. We just add some sanity checks to help with debugging a potential misuse. Link: http://lkml.kernel.org/r/1470754545-17632-9-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 524dfc8247d7..a5c6d709349d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -563,6 +563,7 @@ void __kthread_init_worker(struct kthread_worker *worker, spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); + INIT_LIST_HEAD(&worker->delayed_work_list); worker->task = NULL; } EXPORT_SYMBOL_GPL(__kthread_init_worker); @@ -767,6 +768,107 @@ bool kthread_queue_work(struct kthread_worker *worker, } EXPORT_SYMBOL_GPL(kthread_queue_work); +/** + * kthread_delayed_work_timer_fn - callback that queues the associated kthread + * delayed work when the timer expires. + * @__data: pointer to the data associated with the timer + * + * The format of the function is defined by struct timer_list. + * It should have been called from irqsafe timer with irq already off. + */ +void kthread_delayed_work_timer_fn(unsigned long __data) +{ + struct kthread_delayed_work *dwork = + (struct kthread_delayed_work *)__data; + struct kthread_work *work = &dwork->work; + struct kthread_worker *worker = work->worker; + + /* + * This might happen when a pending work is reinitialized. + * It means that it is used a wrong way. + */ + if (WARN_ON_ONCE(!worker)) + return; + + spin_lock(&worker->lock); + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); + + /* Move the work from worker->delayed_work_list. */ + WARN_ON_ONCE(list_empty(&work->node)); + list_del_init(&work->node); + kthread_insert_work(worker, work, &worker->work_list); + + spin_unlock(&worker->lock); +} +EXPORT_SYMBOL(kthread_delayed_work_timer_fn); + +void __kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct timer_list *timer = &dwork->timer; + struct kthread_work *work = &dwork->work; + + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || + timer->data != (unsigned long)dwork); + + /* + * If @delay is 0, queue @dwork->work immediately. This is for + * both optimization and correctness. The earliest @timer can + * expire is on the closest next tick and delayed_work users depend + * on that there's no such delay when @delay is 0. + */ + if (!delay) { + kthread_insert_work(worker, work, &worker->work_list); + return; + } + + /* Be paranoid and try to detect possible races already now. 
*/
+	kthread_insert_work_sanity_check(worker, work);
+
+	list_add(&work->node, &worker->delayed_work_list);
+	work->worker = worker;
+	timer_stats_timer_set_start_info(&dwork->timer);
+	timer->expires = jiffies + delay;
+	add_timer(timer);
+}
+
+/**
+ * kthread_queue_delayed_work - queue the associated kthread work
+ *	after a delay.
+ * @worker: target kthread_worker
+ * @dwork: kthread_delayed_work to queue
+ * @delay: number of jiffies to wait before queuing
+ *
+ * If the work has not been pending it starts a timer that will queue
+ * the work after the given @delay. If @delay is zero, it queues the
+ * work immediately.
+ *
+ * Return: %false if the @work has already been pending. It means that
+ * either the timer was running or the work was queued. It returns %true
+ * otherwise.
+ */
+bool kthread_queue_delayed_work(struct kthread_worker *worker,
+				struct kthread_delayed_work *dwork,
+				unsigned long delay)
+{
+	struct kthread_work *work = &dwork->work;
+	unsigned long flags;
+	bool ret = false;
+
+	spin_lock_irqsave(&worker->lock, flags);
+
+	if (list_empty(&work->node)) {
+		__kthread_queue_delayed_work(worker, dwork, delay);
+		ret = true;
+	}
+
+	spin_unlock_irqrestore(&worker->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
+
 struct kthread_flush_work {
 	struct kthread_work	work;
 	struct completion	done;
--
cgit v1.2.3

From 37be45d49dec2a411e29d50c9597cfe8184b5645 Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Tue, 11 Oct 2016 13:55:43 -0700
Subject: kthread: allow to cancel kthread work

We are going to use kthread workers more widely and sometimes we will
need to make sure that the work is neither pending nor running.

This patch implements cancel_*_sync() operations as inspired by
workqueues. Well, we are synchronized against the other operations via
the worker lock, we use del_timer_sync() and a counter to count parallel
cancel operations. Therefore the implementation might be easier.

First, we check if a worker is assigned. If not, the work has never been
queued after it was initialized.

Second, we take the worker lock. It must be the right one. The work must
not be assigned to another worker unless it is initialized in between.

Third, we try to cancel the timer when it exists. The timer is deleted
synchronously to make sure that the timer callback is not running. We
need to temporarily release the worker->lock to avoid a possible deadlock
with the callback. In the meantime, we set work->canceling counter to
avoid any queuing.

Fourth, we try to remove the work from a worker list. It might be the
list of either normal or delayed works.

Fifth, if the work is running, we call kthread_flush_work(). It might
take an arbitrary time. We need to release the worker->lock again. In the
meantime, we again block any queuing by the canceling counter.

As already mentioned, the check for a pending kthread work is done under
a lock. In compare with workqueues, we do not need to fight for a single
PENDING bit to block other operations. Therefore we do not suffer from
the thundering storm problem and all parallel canceling jobs might use
kthread_flush_work(). Any queuing is blocked until the counter gets zero.

Link: http://lkml.kernel.org/r/1470754545-17632-10-git-send-email-pmladek@suse.com
Signed-off-by: Petr Mladek
Acked-by: Tejun Heo
Cc: Oleg Nesterov
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: "Paul E.
McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index a5c6d709349d..20505c632bc3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -718,6 +718,19 @@ kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) } EXPORT_SYMBOL(kthread_create_worker_on_cpu); +/* + * Returns true when the work could not be queued at the moment. + * It happens when it is already pending in a worker list + * or when it is being cancelled. + */ +static inline bool queuing_blocked(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + + return !list_empty(&work->node) || work->canceling; +} + static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { @@ -759,7 +772,7 @@ bool kthread_queue_work(struct kthread_worker *worker, unsigned long flags; spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } @@ -859,7 +872,7 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } @@ -919,6 +932,121 @@ void kthread_flush_work(struct kthread_work *work) } EXPORT_SYMBOL_GPL(kthread_flush_work); +/* + * This function removes the work from the worker queue. Also it makes sure + * that it won't get queued later via the delayed work's timer. + * + * The work might still be in use when this function finishes. See the + * current_work proceed by the worker. + * + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, + unsigned long *flags) +{ + /* Try to cancel the timer if exists. */ + if (is_dwork) { + struct kthread_delayed_work *dwork = + container_of(work, struct kthread_delayed_work, work); + struct kthread_worker *worker = work->worker; + + /* + * del_timer_sync() must be called to make sure that the timer + * callback is not running. The lock must be temporary released + * to avoid a deadlock with the callback. In the meantime, + * any queuing is blocked by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, *flags); + del_timer_sync(&dwork->timer); + spin_lock_irqsave(&worker->lock, *flags); + work->canceling--; + } + + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. + */ + if (!list_empty(&work->node)) { + list_del_init(&work->node); + return true; + } + + return false; +} + +static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) +{ + struct kthread_worker *worker = work->worker; + unsigned long flags; + int ret = false; + + if (!worker) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/
+	WARN_ON_ONCE(work->worker != worker);
+
+	ret = __kthread_cancel_work(work, is_dwork, &flags);
+
+	if (worker->current_work != work)
+		goto out_fast;
+
+	/*
+	 * The work is in progress and we need to wait with the lock released.
+	 * In the meantime, block any queuing by setting the canceling counter.
+	 */
+	work->canceling++;
+	spin_unlock_irqrestore(&worker->lock, flags);
+	kthread_flush_work(work);
+	spin_lock_irqsave(&worker->lock, flags);
+	work->canceling--;
+
+out_fast:
+	spin_unlock_irqrestore(&worker->lock, flags);
+out:
+	return ret;
+}
+
+/**
+ * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
+ * @work: the kthread work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself. On return from this
+ * function, @work is guaranteed to be not pending or executing on any CPU.
+ *
+ * kthread_cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the worker on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return: %true if @work was pending, %false otherwise.
+ */
+bool kthread_cancel_work_sync(struct kthread_work *work)
+{
+	return __kthread_cancel_work_sync(work, false);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
+
+/**
+ * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
+ *	wait for it to finish.
+ * @dwork: the kthread delayed work to cancel
+ *
+ * This is kthread_cancel_work_sync() for delayed works.
+ *
+ * Return: %true if @dwork was pending, %false otherwise.
+ */
+bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
+{
+	return __kthread_cancel_work_sync(&dwork->work, true);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);
+
 /**
  * kthread_flush_worker - flush all current works on a kthread_worker
  * @worker: worker to flush
--
cgit v1.2.3

From 9a6b06c8d9a220860468aadb2f1c726570813bf9 Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Tue, 11 Oct 2016 13:55:46 -0700
Subject: kthread: allow to modify delayed kthread work

There are situations when we need to modify the delay of a delayed
kthread work. For example, when the work depends on an event and the
initial delay means a timeout. Then we want to queue the work immediately
when the event happens.

This patch implements kthread_mod_delayed_work() as inspired by
workqueues. It cancels the timer, removes the work from any worker list
and queues it again with the given timeout.

A very special case is when the work is being canceled at the same time.
It might happen because of the regular kthread_cancel_delayed_work_sync()
or by another kthread_mod_delayed_work(). In this case, we do nothing and
let the other operation win. This should not normally happen as the
caller is supposed to synchronize these operations in a reasonable way.

Link: http://lkml.kernel.org/r/1470754545-17632-11-git-send-email-pmladek@suse.com
Signed-off-by: Petr Mladek
Acked-by: Tejun Heo
Cc: Oleg Nesterov
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: "Paul E.
McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 20505c632bc3..c1fcc63fa605 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -976,6 +976,59 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, return false; } +/** + * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work + * @worker: kthread worker to use + * @dwork: kthread delayed work to queue + * @delay: number of jiffies to wait before queuing + * + * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, + * modify @dwork's timer so that it expires after @delay. If @delay is zero, + * @work is guaranteed to be queued immediately. + * + * Return: %true if @dwork was pending and its timer was modified, + * %false otherwise. + * + * A special case is when the work is being canceled in parallel. + * It might be caused either by the real kthread_cancel_delayed_work_sync() + * or yet another kthread_mod_delayed_work() call. We let the other command + * win and return %false here. The caller is supposed to synchronize these + * operations a reasonable way. + * + * This function is safe to call from any context including IRQ handler. + * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() + * for details. + */ +bool kthread_mod_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct kthread_work *work = &dwork->work; + unsigned long flags; + int ret = false; + + spin_lock_irqsave(&worker->lock, flags); + + /* Do not bother with canceling when never queued. */ + if (!work->worker) + goto fast_queue; + + /* Work must not be used with >1 worker, see kthread_queue_work() */ + WARN_ON_ONCE(work->worker != worker); + + /* Do not fight with another command that is canceling this work. */ + if (work->canceling) + goto out; + + ret = __kthread_cancel_work(work, true, &flags); +fast_queue: + __kthread_queue_delayed_work(worker, dwork, delay); +out: + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); + static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; -- cgit v1.2.3 From dbf52682cb02863d22b15e3742988c7c6e3f1710 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:50 -0700 Subject: kthread: better support freezable kthread workers This patch allows to make kthread worker freezable via a new @flags parameter. It will allow to avoid an init work in some kthreads. It currently does not affect the function of kthread_worker_fn() but it might help to do some optimization or fixes eventually. I currently do not know about any other use for the @flags parameter but I believe that we will want more flags in the future. Finally, I hope that it will not cause confusion with @flags member in struct kthread. Well, I guess that we will want to rework the basic kthreads implementation once all kthreads are converted into kthread workers or workqueues. It is possible that we will merge the two structures. 
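[ editor: illustration only, not part of the patch. After this change a caller picks the behavior via @flags; KTW_FREEZABLE is assumed to come from the matching header change, which this kernel/-only diff does not show. ]

#include <linux/kthread.h>

static struct kthread_worker *my_freezable_worker(int cpu)
{
	/*
	 * KTW_FREEZABLE is an assumption here (header change not shown);
	 * flags == 0 would keep the previous, non-freezable behavior.
	 */
	return kthread_create_worker_on_cpu(cpu, KTW_FREEZABLE,
					    "my_helper/%d", cpu);
}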
Link: http://lkml.kernel.org/r/1470754545-17632-12-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index c1fcc63fa605..be2cc1f9dd57 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -560,11 +560,11 @@ void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { + memset(worker, 0, sizeof(struct kthread_worker)); spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); - worker->task = NULL; } EXPORT_SYMBOL_GPL(__kthread_init_worker); @@ -594,6 +594,10 @@ int kthread_worker_fn(void *worker_ptr) */ WARN_ON(worker->task && worker->task != current); worker->task = current; + + if (worker->flags & KTW_FREEZABLE) + set_freezable(); + repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ @@ -627,7 +631,8 @@ repeat: EXPORT_SYMBOL_GPL(kthread_worker_fn); static struct kthread_worker * -__kthread_create_worker(int cpu, const char namefmt[], va_list args) +__kthread_create_worker(int cpu, unsigned int flags, + const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; @@ -657,6 +662,7 @@ __kthread_create_worker(int cpu, const char namefmt[], va_list args) if (IS_ERR(task)) goto fail_task; + worker->flags = flags; worker->task = task; wake_up_process(task); return worker; @@ -668,6 +674,7 @@ fail_task: /** * kthread_create_worker - create a kthread worker + * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) @@ -675,13 +682,13 @@ fail_task: * when the worker was SIGKILLed. */ struct kthread_worker * -kthread_create_worker(const char namefmt[], ...) +kthread_create_worker(unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); - worker = __kthread_create_worker(-1, namefmt, args); + worker = __kthread_create_worker(-1, flags, namefmt, args); va_end(args); return worker; @@ -692,6 +699,7 @@ EXPORT_SYMBOL(kthread_create_worker); * kthread_create_worker_on_cpu - create a kthread worker and bind it * it to a given CPU and the associated NUMA node. * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Use a valid CPU number if you want to bind the kthread worker @@ -705,13 +713,14 @@ EXPORT_SYMBOL(kthread_create_worker); * when the worker was SIGKILLed. */ struct kthread_worker * -kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) +kthread_create_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[], ...) 
{
 	struct kthread_worker *worker;
 	va_list args;

 	va_start(args, namefmt);
-	worker = __kthread_create_worker(cpu, namefmt, args);
+	worker = __kthread_create_worker(cpu, flags, namefmt, args);
 	va_end(args);

 	return worker;
--
cgit v1.2.3

From 48a6d64edadbd40fa5185a890023e9b331d64a48 Mon Sep 17 00:00:00 2001
From: John Siddle
Date: Tue, 11 Oct 2016 13:55:56 -0700
Subject: hung_task: allow hung_task_panic when hung_task_warnings is 0

Previously hung_task_panic would not be respected if enabled after
hung_task_warnings had already been decremented to 0.

Permit the kernel to panic if hung_task_panic is enabled after
hung_task_warnings has already been decremented to 0 and another task
hangs for hung_task_timeout_secs seconds.

Check if hung_task_panic is enabled so we don't return prematurely, and
check if hung_task_warnings is non-zero so we don't print the warning
unnecessarily.

[akpm@linux-foundation.org: fix off-by-one]
Link: http://lkml.kernel.org/r/1473450214-4049-1-git-send-email-jsiddle@redhat.com
Signed-off-by: John Siddle
Cc: Tetsuo Handa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/hung_task.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)
(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 432c3d71d195..2b59c82cc3e1 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,26 +98,26 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)

 	trace_sched_process_hang(t);

-	if (!sysctl_hung_task_warnings)
+	if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic)
 		return;

-	if (sysctl_hung_task_warnings > 0)
-		sysctl_hung_task_warnings--;
-
 	/*
 	 * Ok, the task did not get scheduled for more than 2 minutes,
 	 * complain:
 	 */
-	pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
-	       t->comm, t->pid, timeout);
-	pr_err("      %s %s %.*s\n",
-	       print_tainted(), init_utsname()->release,
-	       (int)strcspn(init_utsname()->version, " "),
-	       init_utsname()->version);
-	pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
-	       " disables this message.\n");
-	sched_show_task(t);
-	debug_show_all_locks();
+	if (sysctl_hung_task_warnings) {
+		sysctl_hung_task_warnings--;
+		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
+		       t->comm, t->pid, timeout);
+		pr_err("      %s %s %.*s\n",
+		       print_tainted(), init_utsname()->release,
+		       (int)strcspn(init_utsname()->version, " "),
+		       init_utsname()->version);
+		pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+		       " disables this message.\n");
+		sched_show_task(t);
+		debug_show_all_locks();
+	}

 	touch_nmi_watchdog();
--
cgit v1.2.3

From ef6000b4c6706cbb1787836442b5a74542b1809f Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 12 Oct 2016 10:23:41 -0700
Subject: Disable the __builtin_return_address() warning globally after all

This effectively reverts commit 377ccbb48373 ("Makefile: Mute warning
for __builtin_return_address(>0) for tracing only") because it turns out
that it really isn't tracing only - it's all over the tree.

We already also had the warning disabled separately for mm/usercopy.c
(which this commit also removes), and it turns out that we will also
want to disable it for get_lock_parent_ip(), that is used for at least
TRACE_IRQFLAGS. Which (when enabled) ends up being all over the tree.

Steven Rostedt had a patch that tried to limit it to just the config
options that actually triggered this, but quite frankly, the extra
complexity and abstraction just isn't worth it.
We have never actually had a case where the warning is actually useful, so let's just disable it globally and not worry about it. Acked-by: Steven Rostedt Cc: Thomas Gleixner Cc: Andrew Morton Cc: Ingo Molnar Cc: Peter Anvin Signed-off-by: Linus Torvalds --- kernel/trace/Makefile | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 992ab9d99f35..e57980845549 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,8 +1,4 @@ -# We are fully aware of the dangers of __builtin_return_address() -FRAME_CFLAGS := $(call cc-disable-warning,frame-address) -KBUILD_CFLAGS += $(FRAME_CFLAGS) - # Do not instrument the tracer itself: ifdef CONFIG_FUNCTION_TRACER -- cgit v1.2.3 From a705e07b9c80df27b6bb12f7a4cd4cf4ed2f728b Mon Sep 17 00:00:00 2001 From: Joonas Lahtinen Date: Wed, 12 Oct 2016 13:18:56 +0300 Subject: cpu/hotplug: Use distinct name for cpu_hotplug.dep_map Use distinctive name for cpu_hotplug.dep_map to avoid the actual cpu_hotplug.lock appearing as cpu_hotplug.lock#2 in lockdep splats. Signed-off-by: Joonas Lahtinen Reviewed-by: Chris Wilson Acked-by: Gautham R. Shenoy Cc: Andrew Morton Cc: Daniel Vetter Cc: Gautham R . Shenoy Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5df20d6d1520..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,7 +228,7 @@ static struct { .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), #ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "cpu_hotplug.lock" }, + .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), #endif }; -- cgit v1.2.3 From 54e23845e965898f65f76aba79fa9db76d830fa9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 11:47:02 +0200 Subject: alarmtimer: Remove unused but set variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the set but unused variable base in alarm_clock_get to fix the following warning when building with 'W=1': kernel/time/alarmtimer.c: In function ‘alarm_timer_create’: kernel/time/alarmtimer.c:545:21: warning: variable ‘base’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Cc: John Stultz Link: http://lkml.kernel.org/r/20161017094702.10873-1-tklauser@distanz.ch Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; - struct alarm_base *base; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return -EPERM; type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } -- cgit v1.2.3 From b5a9b340789b2b24c6896bcf7a065c31a4db671c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 19 Oct 2016 14:45:23 +0200 Subject: sched/fair: Fix incorrect task group 
->load_avg A scheduler performance regression has been reported by Joseph Salisbury, which he bisected back to: 3d30544f0212 ("sched/fair: Apply more PELT fixes) The regression triggers when several levels of task groups are involved (read: SystemD) and cpu_possible_mask != cpu_present_mask. The root cause is that group entity's load (tg_child->se[i]->avg.load_avg) is initialized to scale_load_down(se->load.weight). During the creation of a child task group, its group entities on possible CPUs are attached to parent's cfs_rq (tg_parent) and their loads are added to the parent's load (tg_parent->load_avg) with update_tg_load_avg(). But only the load on online CPUs will then be updated to reflect real load, whereas load on other CPUs will stay at the initial value. The result is a tg_parent->load_avg that is higher than the real load, the weight of group entities (tg_parent->se[i]->load.weight) on online CPUs is smaller than it should be, and the task group gets a less running time than what it could expect. ( This situation can be detected with /proc/sched_debug. The ".tg_load_avg" of the task group will be much higher than sum of ".tg_load_avg_contrib" of online cfs_rqs of the task group. ) The load of group entities don't have to be intialized to something else than 0 because their load will increase when an entity is attached. Reported-by: Joseph Salisbury Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: # 4.8.x Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: joonwoop@codeaurora.org Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes) Link: http://lkml.kernel.org/r/1476881123-10159-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76ee7de1859d..d941c97dfbc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway -- cgit v1.2.3 From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:17 +0100 Subject: mm: replace get_user_pages_remote() write/force parameters with gup_flags This removes the 'write' and 'force' from get_user_pages_remote() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. 
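[ editor: illustration only, not part of the patch. The conversion pattern this change imposes on callers, sketched with hypothetical locals: a call that passed write=0, force=1 now passes FOLL_FORCE, and write=1 would become FOLL_WRITE, or'd with FOLL_FORCE where force was also set. ]

#include <linux/mm.h>

static long my_read_remote_page(struct task_struct *tsk, struct mm_struct *mm,
				unsigned long addr, struct page **page)
{
	/*
	 * Before this patch the same request read:
	 *   get_user_pages_remote(tsk, mm, addr, 1, 0, 1, page, NULL);
	 * i.e. nr_pages=1, write=0, force=1.
	 */
	return get_user_pages_remote(tsk, mm, addr, 1, FOLL_FORCE, page, NULL);
}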
Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; -- cgit v1.2.3 From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:20 +0100 Subject: mm: replace access_process_vm() write parameter with gup_flags This removes the 'write' argument from access_process_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Jesper Nilsson Acked-by: Michal Hocko Acked-by: Michael Ellerman Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 
From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes
Date: Thu, 13 Oct 2016 01:20:20 +0100
Subject: mm: replace access_process_vm() write parameter with gup_flags

This removes the 'write' argument from access_process_vm() and replaces it
with 'gup_flags', as use of this function previously silently implied
FOLL_FORCE, whereas after this patch callers pass this flag explicitly.

We make this explicit because use of FOLL_FORCE can result in surprising
behaviour (and hence bugs) within the mm subsystem.

Signed-off-by: Lorenzo Stoakes
Acked-by: Jesper Nilsson
Acked-by: Michal Hocko
Acked-by: Michael Ellerman
Signed-off-by: Linus Torvalds
---
 kernel/ptrace.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2a99027312a6..e6474f7272ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
		int this_len, retval;

		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
-		retval = access_process_vm(tsk, src, buf, this_len, 0);
+		retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
		if (!retval) {
			if (copied)
				break;
@@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
		if (copy_from_user(buf, src, this_len))
			return -EFAULT;
-		retval = access_process_vm(tsk, dst, buf, this_len, 1);
+		retval = access_process_vm(tsk, dst, buf, this_len,
+				FOLL_FORCE | FOLL_WRITE);
		if (!retval) {
			if (copied)
				break;
@@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
	unsigned long tmp;
	int copied;

-	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
	if (copied != sizeof(tmp))
		return -EIO;
	return put_user(tmp, (unsigned long __user *)data);
@@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 {
	int copied;

-	copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+	copied = access_process_vm(tsk, addr, &data, sizeof(data),
+			FOLL_FORCE | FOLL_WRITE);

	return (copied == sizeof(data)) ? 0 : -EIO;
 }
@@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
	switch (request) {
	case PTRACE_PEEKTEXT:
	case PTRACE_PEEKDATA:
-		ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+		ret = access_process_vm(child, addr, &word, sizeof(word),
+				FOLL_FORCE);
		if (ret != sizeof(word))
			ret = -EIO;
		else
@@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,

	case PTRACE_POKETEXT:
	case PTRACE_POKEDATA:
-		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+		ret = access_process_vm(child, addr, &data, sizeof(data),
+				FOLL_FORCE | FOLL_WRITE);
		ret = (ret != sizeof(data) ? -EIO : 0);
		break;
--
cgit v1.2.3

From 8835ca59dac2bc1e0136791abf3ccd51588803ce Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 19 Oct 2016 09:11:24 -0700
Subject: printk: suppress empty continuation lines

We have a fairly common pattern where you print several things as
continuations on one single line in a loop, and then at the end you do

	printk(KERN_CONT "\n");

to flush the buffered output.

But if the output was flushed by something else (concurrent printk
activity, or just system logging), we don't want that final flushing to
just print an empty line.

So just suppress empty continuation lines when they couldn't be merged
into the line they are a continuation of.

Signed-off-by: Linus Torvalds
---
 kernel/printk/printk.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d5e397315473..de08fc90baaf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
			cont_flush();
	}

+	/* Skip empty continuation lines that couldn't be added - they just flush */
+	if (!text_len && (lflags & LOG_CONT))
+		return 0;
+
	/* If it doesn't end in a newline, try to buffer the current line */
	if (!(lflags & LOG_NEWLINE)) {
		if (cont_add(facility, level, lflags, text, text_len))
--
cgit v1.2.3
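[ ed: the pattern the changelog refers to looks roughly like this; the
  function and its arguments are invented for illustration, only the
  printk/KERN_CONT usage is the point: ]

#include <linux/printk.h>

/*
 * Build one logical line from pieces, then flush it.  Before the patch
 * above, the trailing KERN_CONT "\n" printed a stray empty line whenever
 * concurrent printk activity had already flushed the partial line.
 */
static void dump_values(const int *v, int n)
{
	int i;

	printk(KERN_INFO "values:");
	for (i = 0; i < n; i++)
		printk(KERN_CONT " %d", v[i]);
	printk(KERN_CONT "\n");	/* now a no-op when there is nothing to merge */
}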
From 3118dac501bc0317de099db81618d589503351e1 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee
Date: Thu, 6 Oct 2016 23:06:43 +0530
Subject: kernel/irq: Export irq_set_parent()

The TPS65217 driver grew interrupt support which uses irq_set_parent().
While it's not yet clear why this is used in the first place, building the
driver as a module fails with:

  ERROR: ".irq_set_parent" [drivers/mfd/tps65217.ko] undefined!

The correctness of the driver change is still being investigated, but for
now it's less trouble to export irq_set_parent() than to deal with the
build wreckage.

[ tglx: Rewrote changelog and made the export GPL ]

Fixes: 6556bdacf646 ("mfd: tps65217: Add support for IRQs")
Signed-off-by: Sudip Mukherjee
Cc: Sudip Mukherjee
Cc: Marcin Niestroj
Cc: Grygorii Strashko
Cc: Tony Lindgren
Cc: Lee Jones
Link: http://lkml.kernel.org/r/1475775403-27207-1-git-send-email-sudipm.mukherjee@gmail.com
Signed-off-by: Thomas Gleixner
---
 kernel/irq/manage.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0c5f1a5db654..9c4d30483264 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
	irq_put_desc_unlock(desc, flags);
	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_set_parent);

 #endif

--
cgit v1.2.3

From 1adb469b9b76276d7e5ea36a20a24c47d6618a0b Mon Sep 17 00:00:00 2001
From: Jon Hunter
Date: Fri, 21 Oct 2016 16:24:09 +0100
Subject: PM / suspend: Fix missing KERN_CONT for suspend message

Commit 4bcc595ccd80 (printk: reinstate KERN_CONT for printing continuation
lines) exposed a missing KERN_CONT in one of the messages shown on entering
suspend. With v4.9-rc1, the 'done.' shown after syncing the filesystems no
longer appears as a continuation but as a new message with its own
timestamp:

[    9.259566] PM: Syncing filesystems ...
[    9.264119] done.

Fix this by adding the KERN_CONT log level for the 'done.' part of the
message seen after syncing filesystems. While we are at it, convert these
suspend printks to pr_info and pr_cont, respectively.

Signed-off-by: Jon Hunter
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/suspend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1e7f5da648d9..6ccb08f57fcb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state)

 #ifndef CONFIG_SUSPEND_SKIP_SYNC
	trace_suspend_resume(TPS("sync_filesystems"), 0, true);
-	printk(KERN_INFO "PM: Syncing filesystems ... ");
+	pr_info("PM: Syncing filesystems ... ");
	sys_sync();
-	printk("done.\n");
+	pr_cont("done.\n");
	trace_suspend_resume(TPS("sync_filesystems"), 0, false);
 #endif
--
cgit v1.2.3
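[ ed: the general rule after commit 4bcc595ccd80, which both printk fixes
  above follow: any message assembled from multiple printk calls must mark
  the continuations explicitly.  A hedged sketch - the function and the
  do_long_work() helper are invented for illustration: ]

#include <linux/printk.h>

static void do_long_work(void);	/* hypothetical, stands in for real work */

static void announce_and_work(void)
{
	pr_info("frobnicating widgets ... ");	/* opens a timestamped line */
	do_long_work();
	pr_cont("done.\n");			/* appends to that same line */
}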