diff options
author | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2020-02-04 11:31:11 +0100 |
---|---|---|
committer | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2020-02-07 13:50:20 +0100 |
commit | 500f9a04cd76f504285ad99b15e390e20ab17e0c (patch) | |
tree | e1c5ff3a78e75f0847b47fbac63d279dd411e7e5 /kernel | |
parent | 0f549d8c4d5edd68a2a492afd48228458d3bca4a (diff) | |
parent | 6d0c334a400db31751c787c411e7187ab59a3f1d (diff) |
Merge tag 'v4.14.164' into 4.14-2.3.x-imx
This is the 4.14.164 stable release
Conflicts:
arch/arm/Kconfig.debug
arch/arm/boot/dts/imx7s.dtsi
arch/arm/mach-imx/cpuidle-imx6q.c
arch/arm/mach-imx/cpuidle-imx6sx.c
arch/arm64/kernel/cpu_errata.c
arch/arm64/kvm/hyp/tlb.c
drivers/crypto/caam/caamalg.c
drivers/crypto/mxs-dcp.c
drivers/dma/imx-sdma.c
drivers/gpio/gpio-vf610.c
drivers/gpu/drm/bridge/adv7511/adv7511_drv.c
drivers/input/keyboard/imx_keypad.c
drivers/input/keyboard/snvs_pwrkey.c
drivers/mmc/core/block.c
drivers/mmc/core/queue.h
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/net/can/flexcan.c
drivers/net/can/rx-offload.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/wireless/ath/ath10k/pci.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
drivers/pci/dwc/pci-imx6.c
drivers/spi/spi-fsl-lpspi.c
drivers/usb/dwc3/gadget.c
include/net/tcp.h
sound/soc/fsl/Kconfig
sound/soc/fsl/fsl_esai.c
Diffstat (limited to 'kernel')
94 files changed, 2844 insertions, 1228 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 172d151d429c..43e92e3691ec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -48,9 +49,6 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_FUTEX) += futex.o -ifeq ($(CONFIG_COMPAT),y) -obj-$(CONFIG_FUTEX) += futex_compat.o -endif obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 4a98f6e314a9..35f1d706bd5b 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -365,12 +365,12 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d->d_sb->s_dev; watch->ino = d_backing_inode(d)->i_ino; } + inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8dd4063647c2..215c6e1ee026 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1094,22 +1094,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) int err = 0; struct audit_entry *entry; - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - switch (type) { case AUDIT_ADD_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: - err = -EINVAL; WARN_ON(1); + return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 76d789d6cea0..ffa8d64f6fef 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1102,7 +1102,7 @@ static void audit_log_execve_info(struct audit_context *context, } /* write as much as we can to the audit log */ - if (len_buf > 0) { + if (len_buf >= 0) { /* NOTE: some magic numbers here - basically if we * can't fit a reasonable amount of data into the * existing audit buffer, flush it and start with diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index af3ab6164ff5..be282c135a66 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o +CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d203a5d6b726..e7211b0fa27c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -51,6 +51,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -289,6 +290,12 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +/* All BPF JIT sysctl knobs here. */ +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_harden __read_mostly; +int bpf_jit_kallsyms __read_mostly; +long bpf_jit_limit __read_mostly; + static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, unsigned long *symbol_start, @@ -357,8 +364,6 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -int bpf_jit_kallsyms __read_mostly; - static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); @@ -485,27 +490,75 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +static atomic_long_t bpf_jit_current; + +/* Can be overridden by an arch's JIT compiler if it has a custom, + * dedicated BPF backend memory area, or if neither of the two + * below apply. + */ +u64 __weak bpf_jit_alloc_exec_limit(void) +{ +#if defined(MODULES_VADDR) + return MODULES_END - MODULES_VADDR; +#else + return VMALLOC_END - VMALLOC_START; +#endif +} + +static int __init bpf_jit_charge_init(void) +{ + /* Only used as heuristic here to derive limit. */ + bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + PAGE_SIZE), LONG_MAX); + return 0; +} +pure_initcall(bpf_jit_charge_init); + +static int bpf_jit_charge_modmem(u32 pages) +{ + if (atomic_long_add_return(pages, &bpf_jit_current) > + (bpf_jit_limit >> PAGE_SHIFT)) { + if (!capable(CAP_SYS_ADMIN)) { + atomic_long_sub(pages, &bpf_jit_current); + return -EPERM; + } + } + + return 0; +} + +static void bpf_jit_uncharge_modmem(u32 pages) +{ + atomic_long_sub(pages, &bpf_jit_current); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - unsigned int size, hole, start; + u32 size, hole, start, pages; /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + pages = size / PAGE_SIZE; + + if (bpf_jit_charge_modmem(pages)) + return NULL; hdr = module_alloc(size); - if (hdr == NULL) + if (!hdr) { + bpf_jit_uncharge_modmem(pages); return NULL; + } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = size / PAGE_SIZE; + hdr->pages = pages; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -518,7 +571,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { + u32 pages = hdr->pages; + module_memfree(hdr); + bpf_jit_uncharge_modmem(pages); } /* This symbol is only overridden by archs that have different @@ -539,8 +595,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } -int bpf_jit_harden __read_mostly; - static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -552,6 +606,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! + */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { @@ -939,22 +1013,22 @@ select_insn: ALU64_MOD_X: if (unlikely(SRC == 0)) return 0; - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: if (unlikely((u32)SRC == 0)) return 0; - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: if (unlikely(SRC == 0)) @@ -964,17 +1038,17 @@ select_insn: ALU_DIV_X: if (unlikely((u32)SRC == 0)) return 0; - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1278,7 +1352,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ @@ -1306,9 +1380,13 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #else -static unsigned int __bpf_prog_ret0(const void *ctx, - const struct bpf_insn *insn) +static unsigned int __bpf_prog_ret0_warn(const void *ctx, + const struct bpf_insn *insn) { + /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON + * is not working properly, so warn about it! + */ + WARN_ON_ONCE(1); return 0; } #endif @@ -1365,7 +1443,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else - fp->bpf_func = __bpf_prog_ret0; + fp->bpf_func = __bpf_prog_ret0_warn; #endif /* eBPF JITs can rewrite the program in case constant diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e745d6a88224..1060eee6c8d5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -156,6 +156,9 @@ static void dev_map_free(struct bpf_map *map) synchronize_rcu(); + /* Make sure prior __dev_map_entry_free() have completed. */ + rcu_barrier(); + /* To ensure all pending flush operations have completed wait for flush * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected @@ -385,8 +388,7 @@ static int dev_map_notification(struct notifier_block *notifier, struct bpf_dtab_netdev *dev, *odev; dev = READ_ONCE(dtab->netdev_map[i]); - if (!dev || - dev->dev->ifindex != netdev->ifindex) + if (!dev || netdev != dev->dev) continue; odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); if (dev == odev) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3d0ecc273cc6..505e69854eb8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -498,18 +498,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } -static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, + void *key, const bool mark) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { - bpf_lru_node_set_ref(&l->lru_node); + if (mark) + bpf_lru_node_set_ref(&l->lru_node); return l->key + round_up(map->key_size, 8); } return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, true); +} + +static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, false); +} + static u32 htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -655,7 +667,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } if (htab_is_prealloc(htab)) { - pcpu_freelist_push(&htab->freelist, &l->fnode); + __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); l->htab = htab; @@ -717,7 +729,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } else { struct pcpu_freelist_node *l; - l = pcpu_freelist_pop(&htab->freelist); + l = __pcpu_freelist_pop(&htab->freelist); if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); @@ -1160,6 +1172,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_map_lookup_elem, + .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, @@ -1190,7 +1203,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -1206,8 +1218,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; - if (htab_is_lru(htab)) - bpf_lru_node_set_ref(&l->lru_node); + /* We do not mark LRU map element here in order to not mess up + * eviction heuristics when user space does a map walk. + */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be1dde967208..ccf9ffd5da78 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -365,19 +365,6 @@ out: } EXPORT_SYMBOL_GPL(bpf_obj_get_user); -static void bpf_evict_inode(struct inode *inode) -{ - enum bpf_type type; - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); - if (!bpf_inode_type(inode, &type)) - bpf_any_put(inode->i_private, type); -} - /* * Display the mount options in /proc/mounts. */ @@ -390,11 +377,28 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void bpf_destroy_inode_deferred(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + enum bpf_type type; + + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); + free_inode_nonrcu(inode); +} + +static void bpf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .evict_inode = bpf_evict_inode, + .destroy_inode = bpf_destroy_inode, }; enum { diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1da574612bea..c0c494b7647b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; + u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); @@ -34,7 +35,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } - inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + inner_map_meta_size = sizeof(*inner_map_meta); + /* In some cases verifier needs to access beyond just base map. */ + if (inner_map->ops == &array_map_ops) + inner_map_meta_size = sizeof(struct bpf_array); + + inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); @@ -44,9 +50,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; - inner_map_meta->ops = inner_map->ops; inner_map_meta->max_entries = inner_map->max_entries; + /* Misc members not needed in bpf_map_meta_equal() check. */ + inner_map_meta->ops = inner_map->ops; + if (inner_map->ops == &array_map_ops) { + inner_map_meta->unpriv_array = inner_map->unpriv_array; + container_of(inner_map_meta, struct bpf_array, map)->index_mask = + container_of(inner_map, struct bpf_array, map)->index_mask; + } + fdput(f); return inner_map_meta; } diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 673fa6fe2d73..0c1b4ba9e90e 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) free_percpu(s->freelist); } -static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, - struct pcpu_freelist_node *node) +static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); node->next = head->first; @@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } -void pcpu_freelist_push(struct pcpu_freelist *s, +void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - __pcpu_freelist_push(head, node); + ___pcpu_freelist_push(head, node); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + unsigned long flags; + + local_irq_save(flags); + __pcpu_freelist_push(s, node); + local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, @@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, for_each_possible_cpu(cpu) { again: head = per_cpu_ptr(s->freelist, cpu); - __pcpu_freelist_push(head, buf); + ___pcpu_freelist_push(head, buf); i++; buf += elem_size; if (i == nr_elems) @@ -74,14 +84,12 @@ again: local_irq_restore(flags); } -struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - unsigned long flags; int orig_cpu, cpu; - local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) { - local_irq_restore(flags); + if (cpu == orig_cpu) return NULL; - } } } + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_node *ret; + unsigned long flags; + + local_irq_save(flags); + ret = __pcpu_freelist_pop(s); + local_irq_restore(flags); + return ret; +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3049aae8ea1e..c3960118e617 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -22,8 +22,12 @@ struct pcpu_freelist_node { struct pcpu_freelist_node *next; }; +/* pcpu_freelist_* do spin_lock_irqsave. */ void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */ +void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *); void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems); int pcpu_freelist_init(struct pcpu_freelist *); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5c9deed4524e..f5c1d5479ba3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -348,12 +348,12 @@ static int map_create(union bpf_attr *attr) err = bpf_map_new_fd(map); if (err < 0) { /* failed to allocate fd. - * bpf_map_put() is needed because the above + * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ - bpf_map_put(map); + bpf_map_put_with_uref(map); return err; } @@ -493,7 +493,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_htab_map_lookup_elem(map, key, value); } else { rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); if (ptr) memcpy(value, ptr, value_size); rcu_read_unlock(); @@ -1064,20 +1067,26 @@ static int bpf_prog_load(union bpf_attr *attr) if (err) goto free_used_maps; - err = bpf_prog_new_fd(prog); - if (err < 0) { - /* failed to allocate fd. - * bpf_prog_put() is needed because the above - * bpf_prog_alloc_id() has published the prog - * to the userspace and the userspace may - * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. - */ - bpf_prog_put(prog); - return err; - } - + /* Upon success of bpf_prog_alloc_id(), the BPF prog is + * effectively publicly exposed. However, retrieving via + * bpf_prog_get_fd_by_id() will take another reference, + * therefore it cannot be gone underneath us. + * + * Only for the time /after/ successful bpf_prog_new_fd() + * and before returning to userspace, we might just hold + * one reference and any parallel close on that fd could + * rip everything out. Hence, below notifications must + * happen before bpf_prog_new_fd(). + * + * Also, any failure handling from this point onwards must + * be using bpf_prog_put() given the program is exposed. + */ bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); + + err = bpf_prog_new_fd(prog); + if (err < 0) + bpf_prog_put(prog); return err; free_used_maps: @@ -1345,7 +1354,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) fd = bpf_map_new_fd(map); if (fd < 0) - bpf_map_put(map); + bpf_map_put_with_uref(map); return fd; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6755fd5bae2..615a2e44d2a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -265,10 +265,11 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose(")"); } } - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] == STACK_SPILL) + verbose(" fp%d=%s", + (-i - 1) * BPF_REG_SIZE, + reg_type_str[state->stack[i].spilled_ptr.type]); } verbose("\n"); } @@ -434,40 +435,133 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, } } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { - struct bpf_verifier_stack_elem *elem; - int insn_idx; + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); + return 0; +} + +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) +{ + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; + + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; + } + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; + kfree(state->stack); + state->stack = new_stack; + return 0; +} + +static void free_verifier_state(struct bpf_verifier_state *state, + bool free_self) +{ + kfree(state->stack); + if (free_self) + kfree(state); +} + +/* copy verifier state from src to dst growing dst stack space + * when necessary to accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + int err; + + err = realloc_verifier_state(dst, src->allocated_stack, false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + return copy_stack_state(dst, src); +} + +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, + int *insn_idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem, *head = env->head; + int err; if (env->head == NULL) - return -1; + return -ENOENT; - memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); - insn_idx = env->head->insn_idx; + if (cur) { + err = copy_verifier_state(cur, &head->st); + if (err) + return err; + } + if (insn_idx) + *insn_idx = head->insn_idx; if (prev_insn_idx) - *prev_insn_idx = env->head->prev_insn_idx; - elem = env->head->next; - kfree(env->head); + *prev_insn_idx = head->prev_insn_idx; + elem = head->next; + free_verifier_state(&head->st, false); + kfree(head); env->head = elem; env->stack_size--; - return insn_idx; + return 0; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_stack_elem *elem; + struct bpf_verifier_state *cur = env->cur_state; + int err; - elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; - memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->st.speculative |= speculative; env->head = elem; env->stack_size++; + err = copy_verifier_state(&elem->st, cur); + if (err) + goto err; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose("BPF program is too complex\n"); goto err; @@ -475,7 +569,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; err: /* pop all elements and return */ - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); return NULL; } @@ -671,7 +765,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { verbose("R%d is invalid\n", regno); @@ -684,7 +778,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose("R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(&env->cur_state, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -721,10 +815,21 @@ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno, int insn_idx) { - int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); + if (err) + return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ + if (!env->allow_ptr_leaks && + state->stack[spi].slot_type[0] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } if (value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type)) { @@ -736,11 +841,11 @@ static int check_stack_write(struct bpf_verifier_env *env, } /* save register state */ - state->spilled_regs[spi] = state->regs[value_regno]; - state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC && + if (state->stack[spi].slot_type[i] == STACK_MISC && !env->allow_ptr_leaks) { int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; int soff = (-spi - 1) * BPF_REG_SIZE; @@ -763,14 +868,15 @@ static int check_stack_write(struct bpf_verifier_env *env, } *poff = soff; } - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + state->stack[spi].slot_type[i] = STACK_SPILL; } } else { /* regular write of data into stack */ - state->spilled_regs[spi] = (struct bpf_reg_state) {}; + state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = + STACK_MISC; } return 0; } @@ -781,10 +887,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->spilled_regs[slot].live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; } @@ -793,34 +899,37 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { - u8 *slot_type; - int i, spi; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + u8 *stype; - slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + if (state->allocated_stack <= slot) { + verbose("invalid read from stack off %d+0 size %d\n", + off, size); + return -EACCES; + } + stype = state->stack[spi].slot_type; - if (slot_type[0] == STACK_SPILL) { + if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { verbose("invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { - if (slot_type[i] != STACK_SPILL) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { verbose("corrupted spill memory\n"); return -EACCES; } } - spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; - if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->spilled_regs[spi]; + state->regs[value_regno] = state->stack[spi].spilled_ptr; mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { - if (slot_type[i] != STACK_MISC) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { verbose("invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -833,11 +942,37 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } } +static int check_stack_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + /* Stack accesses must be at a fixed offset, so that we + * can determine what type of data were returned. See + * check_stack_read(). + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", @@ -849,9 +984,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -861,13 +996,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (log_level) print_verifier_state(state); + /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0) { + if (reg->smin_value < 0 && + (reg->smin_value == S64_MIN || + (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || + reg->smin_value + off < 0)) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -924,7 +1063,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { @@ -938,7 +1077,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; int err; @@ -1008,19 +1147,19 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); + return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = &env->cur_state.regs[regno]; + const struct bpf_reg_state *reg = cur_regs(env) + regno; return reg->type == PTR_TO_CTX; } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = &env->cur_state.regs[regno]; + const struct bpf_reg_state *reg = cur_regs(env) + regno; return reg->type == PTR_TO_PACKET; } @@ -1112,6 +1251,30 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); } +static int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + /* Access to ctx or passing it to a helper is only allowed in + * its original, unmodified form. + */ + + if (reg->off) { + verbose("dereference of modified ctx ptr R%d off=%d disallowed\n", + regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("variable ctx access var_off=%s disallowed\n", tn_buf); + return -EACCES; + } + + return 0; +} + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1145,8 +1308,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once) { - struct bpf_verifier_state *state = &env->cur_state; - struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1170,7 +1334,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -1180,22 +1344,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - /* ctx accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - */ - if (reg->off) { - verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", - regno, reg->off, off - reg->off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a @@ -1203,49 +1355,29 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); - state->regs[value_regno].id = 0; - state->regs[value_regno].off = 0; - state->regs[value_regno].range = 0; - state->regs[value_regno].type = reg_type; + mark_reg_known_zero(regs, value_regno); + regs[value_regno].id = 0; + regs[value_regno].off = 0; + regs[value_regno].range = 0; + regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { - /* stack accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - * See check_stack_read(). - */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } off += reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); - return -EACCES; - } + err = check_stack_access(env, reg, off, size); + if (err) + return err; if (env->prog->aux->stack_depth < -off) env->prog->aux->stack_depth = -off; - if (t == BPF_WRITE) { - if (!env->allow_ptr_leaks && - state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && - size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); - return -EACCES; - } + if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, value_regno, insn_idx); - } else { + else err = check_stack_read(state, off, size, value_regno); - } } else if (reg->type == PTR_TO_PACKET) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); @@ -1258,7 +1390,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1266,9 +1398,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && - state->regs[value_regno].type == SCALAR_VALUE) { + regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - coerce_reg_to_size(&state->regs[value_regno], size); + coerce_reg_to_size(®s[value_regno], size); } return err; } @@ -1333,9 +1465,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs; - int off, i; + int off, i, slot, spi; if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1376,7 +1508,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + slot = -(off + i) - 1; + spi = slot / BPF_REG_SIZE; + if (state->allocated_stack <= slot || + state->stack[spi].slot_type[slot % BPF_REG_SIZE] != + STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1389,7 +1525,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; switch (reg->type) { case PTR_TO_PACKET: @@ -1406,7 +1542,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1449,6 +1585,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; } else if (arg_type == ARG_PTR_TO_MEM || arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; @@ -1678,7 +1817,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn) */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1687,10 +1826,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) regs[i].type == PTR_TO_PACKET_END) mark_reg_unknown(regs, i); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; @@ -1700,9 +1839,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { - struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1776,6 +1914,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; } + regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(regs, caller_saved[i]); @@ -1880,6 +2019,125 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; +} + +static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, + u32 alu_state, u32 alu_limit) +{ + /* If we arrived here from different branches with different + * state or limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + return 0; +} + +static int sanitize_val_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + if (update_alu_sanitation_state(aux, alu_state, alu_limit)) + return -EACCES; +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg && ret) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -1890,14 +2148,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg; + int ret; dst_reg = ®s[dst]; @@ -1949,6 +2208,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose("R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -1999,6 +2263,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose("R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) @@ -2071,6 +2340,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose("R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; + case PTR_TO_MAP_VALUE: + if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + verbose("R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", + off_reg == dst_reg ? dst : src); + return -EACCES; + } + /* fall-through */ default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) @@ -2085,6 +2361,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (!env->allow_ptr_leaks) { + if (dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1)) { + verbose("R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } else if (dst_reg->type == PTR_TO_STACK && + check_stack_access(env, dst_reg, dst_reg->off + + dst_reg->var_off.value, 1)) { + verbose("R%d stack pointer arithmetic goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } + } + return 0; } @@ -2097,12 +2392,14 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + u32 dst = insn->dst_reg; + int ret; if (insn_bitness == 32) { /* Relevant for 32-bit RSH: Information can propagate towards @@ -2137,6 +2434,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose("R%d tried to add from different pointers or scalars\n", dst); + return ret; + } if (signed_add_overflows(dst_reg->smin_value, smin_val) || signed_add_overflows(dst_reg->smax_value, smax_val)) { dst_reg->smin_value = S64_MIN; @@ -2156,6 +2458,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose("R%d tried to sub from different pointers or scalars\n", dst); + return ret; + } if (signed_sub_overflows(dst_reg->smin_value, smax_val) || signed_sub_overflows(dst_reg->smax_value, smin_val)) { /* Overflow possible, we know nothing */ @@ -2345,7 +2652,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2419,12 +2726,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); + print_verifier_state(env->cur_state); verbose("verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); + print_verifier_state(env->cur_state); verbose("verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2434,7 +2741,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; @@ -2661,10 +2968,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) reg->range = max(reg->range, new_range); } @@ -2914,17 +3221,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -2984,7 +3291,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; @@ -3087,7 +3395,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -3148,7 +3456,8 @@ static bool may_access_skb(enum bpf_prog_type type) */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); + static const int ctx_reg = BPF_REG_6; u8 mode = BPF_MODE(insn->code); int i, err; @@ -3165,11 +3474,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check whether implicit source operand (register R6) is readable */ - err = check_reg_arg(env, BPF_REG_6, SRC_OP); + err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) return err; - if (regs[BPF_REG_6].type != PTR_TO_CTX) { + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3181,6 +3490,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + if (err < 0) + return err; + /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(regs, caller_saved[i]); @@ -3534,6 +3847,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + struct idpair *idmap) +{ + int i, spi; + + /* if explored stack has more populated slots than current stack + * such stacks are not equivalent + */ + if (old->allocated_stack > cur->allocated_stack) + return false; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + spi = i / BPF_REG_SIZE; + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; + if (!regsafe(&old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, + idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + } + return true; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -3568,6 +3932,12 @@ static bool states_equal(struct bpf_verifier_env *env, bool ret = false; int i; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); /* If we failed to allocate the idmap, just say it's not safe */ if (!idmap) @@ -3578,37 +3948,8 @@ static bool states_equal(struct bpf_verifier_env *env, goto out_free; } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (old->stack_slot_type[i] == STACK_INVALID) - continue; - if (old->stack_slot_type[i] != cur->stack_slot_type[i]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - goto out_free; - if (i % BPF_REG_SIZE) - continue; - if (old->stack_slot_type[i] != STACK_SPILL) - continue; - if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - idmap)) - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. - * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. - * return false to continue verification of this path - */ - goto out_free; - else - continue; - } + if (!stacksafe(old, cur, idmap)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -3644,17 +3985,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, } } /* ... and stack slots */ - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { - if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) continue; - if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (parent->spilled_regs[i].live & REG_LIVE_READ) + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; - if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) continue; - if (state->spilled_regs[i].live & REG_LIVE_READ) { - parent->spilled_regs[i].live |= REG_LIVE_READ; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; touched = true; } } @@ -3684,7 +4027,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - int i; + struct bpf_verifier_state *cur = env->cur_state; + int i, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3694,7 +4038,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) { + if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -3705,7 +4049,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, &env->cur_state); + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -3717,16 +4061,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; /* add new state to the head of linked list */ - memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + err = copy_verifier_state(&new_sl->state, cur); + if (err) { + free_verifier_state(&new_sl->state, false); + kfree(new_sl); + return err; + } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - env->cur_state.parent = &new_sl->state; + cur->parent = &new_sl->state; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -3734,10 +4083,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) */ for (i = 0; i < BPF_REG_FP; i++) - env->cur_state.regs[i].live = REG_LIVE_NONE; - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) - if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) - env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } @@ -3752,29 +4101,31 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; int insn_cnt = env->prog->len; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + env->cur_state = state; + init_reg_state(state->regs); state->parent = NULL; - insn_idx = 0; for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose("invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -3783,17 +4134,19 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ if (log_level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + verbose("\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else - verbose("%d: safe\n", insn_idx); + verbose("%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -3803,24 +4156,27 @@ static int do_check(struct bpf_verifier_env *env) if (log_level > 1 || (log_level && do_print_state)) { if (log_level > 1) - verbose("%d:", insn_idx); + verbose("%d:", env->insn_idx); else - verbose("\nfrom %d to %d:", - prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + verbose("\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); + print_verifier_state(env->cur_state); do_print_state = false; } if (log_level) { - verbose("%d: ", insn_idx); + verbose("%d: ", env->insn_idx); print_bpf_insn(env, insn); } - err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); + err = ext_analyzer_insn_hook(env, env->insn_idx, env->prev_insn_idx); if (err) return err; - env->insn_aux_data[insn_idx].seen = true; + regs = cur_regs(env); + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -3845,13 +4201,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -3878,10 +4234,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -3897,13 +4253,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -3932,9 +4288,9 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; @@ -3950,7 +4306,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - err = check_call(env, insn->imm, insn_idx); + err = check_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -3963,7 +4319,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { @@ -3991,15 +4347,17 @@ static int do_check(struct bpf_verifier_env *env) } process_bpf_exit: - insn_idx = pop_stack(env, &prev_insn_idx); - if (insn_idx < 0) { + err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); + if (err < 0) { + if (err != -ENOENT) + return err; break; } else { do_print_state = true; continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -4016,8 +4374,8 @@ process_bpf_exit: if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose("invalid BPF_LD mode\n"); return -EINVAL; @@ -4027,7 +4385,7 @@ process_bpf_exit: return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose("processed %d insns, stack depth %d\n", @@ -4402,6 +4760,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_prog *new_prog; struct bpf_map *map_ptr; int i, cnt, delta = 0; + struct bpf_insn_aux_data *aux; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || @@ -4422,6 +4781,58 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; @@ -4557,6 +4968,7 @@ static void free_states(struct bpf_verifier_env *env) if (sl) while (sl != STATE_LIST_MARK) { sln = sl->next; + free_verifier_state(&sl->state, false); kfree(sl); sl = sln; } @@ -4633,9 +5045,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret == 0) @@ -4741,9 +5157,13 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); mutex_unlock(&bpf_verifier_lock); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 109c32c56de7..2c57030f54aa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -187,7 +187,7 @@ static u64 css_serial_nr_next = 1; */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; +static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ @@ -204,7 +204,8 @@ static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -642,6 +643,7 @@ struct css_set init_css_set = { .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), @@ -737,6 +739,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/* + * @task is leaving, advance task iterators which are pointing to it so + * that they can resume at the next position. Advancing an iterator might + * remove it from the list, use safe walk. See css_task_iter_skip() for + * details. + */ +static void css_set_skip_task_iters(struct css_set *cset, + struct task_struct *task) +{ + struct css_task_iter *it, *pos; + + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) + css_task_iter_skip(it, task); +} + /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -762,22 +779,9 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { - struct css_task_iter *it, *pos; - WARN_ON_ONCE(list_empty(&task->cg_list)); - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. - */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - + css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -1104,6 +1108,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); @@ -1692,7 +1697,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) *root_flags = 0; - if (!data) + if (!data || *data == '\0') return 0; while ((token = strsep(&data, ",")) != NULL) { @@ -1942,7 +1947,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_namespace *ns) { struct dentry *dentry; - bool new_sb; + bool new_sb = false; dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); @@ -1952,6 +1957,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, */ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { struct dentry *nsdentry; + struct super_block *sb = dentry->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); @@ -1962,12 +1968,14 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(dentry); + if (IS_ERR(nsdentry)) + deactivate_locked_super(sb); dentry = nsdentry; } - if (IS_ERR(dentry) || !new_sb) + if (!new_sb) cgroup_put(&root->cgrp); return dentry; @@ -4040,15 +4048,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - else + else if (!list_empty(&cset->mg_tasks)) it->task_pos = cset->mg_tasks.next; + else + it->task_pos = cset->dying_tasks.next; it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4074,9 +4085,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task) +{ + lockdep_assert_held(&css_set_lock); + + if (it->task_pos == &task->cg_list) { + it->task_pos = it->task_pos->next; + it->flags |= CSS_TASK_ITER_SKIPPED; + } +} + static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *next; + struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: @@ -4086,25 +4108,40 @@ repeat: * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. */ - next = it->task_pos->next; - - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (it->flags & CSS_TASK_ITER_SKIPPED) + it->flags &= ~CSS_TASK_ITER_SKIPPED; + else + it->task_pos = it->task_pos->next; - if (next == it->mg_tasks_head) + if (it->task_pos == it->tasks_head) + it->task_pos = it->mg_tasks_head->next; + if (it->task_pos == it->mg_tasks_head) + it->task_pos = it->dying_tasks_head->next; + if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); - else - it->task_pos = next; } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } - /* if PROCS, skip over tasks which aren't group leaders */ - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && - !thread_group_leader(list_entry(it->task_pos, struct task_struct, - cg_list))) - goto repeat; + if (!it->task_pos) + return; + + task = list_entry(it->task_pos, struct task_struct, cg_list); + + if (it->flags & CSS_TASK_ITER_PROCS) { + /* if PROCS, skip over tasks which aren't group leaders */ + if (!thread_group_leader(task)) + goto repeat; + + /* and dying leaders w/o live member threads */ + if (!atomic_read(&task->signal->live)) + goto repeat; + } else { + /* skip all dying ones */ + if (task->flags & PF_EXITING) + goto repeat; + } } /** @@ -4160,6 +4197,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); + /* @it may be half-advanced by skips, finish advancing */ + if (it->flags & CSS_TASK_ITER_SKIPPED) + css_task_iter_advance(it); + if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); @@ -4543,9 +4584,11 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ trace_cgroup_release(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4742,12 +4785,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->root = root; cgrp->level = level; + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (tcgrp != cgrp) tcgrp->nr_descendants++; } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5030,10 +5075,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); @@ -5109,7 +5156,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; + have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5531,6 +5578,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; spin_unlock_irq(&css_set_lock); } else { @@ -5543,16 +5591,26 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_free(struct task_struct *task) +void cgroup_release(struct task_struct *task) { - struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); + do_each_subsys_mask(ss, ssid, have_release_callback) { + ss->release(task); } while_each_subsys_mask(); + if (use_task_css_set_links) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } +} + +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4657e2924ecb..0a0e1aa11f5e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2436,10 +2436,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } +/** + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. + * @tsk: pointer to task_struct with which the scheduler is struggling + * + * Description: In the case that the scheduler cannot find an allowed cpu in + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy + * mode however, this value is the same as task_cs(tsk)->effective_cpus, + * which will not contain a sane cpumask during cases such as cpu hotplugging. + * This is the absolute last resort for the scheduler and it is only used if + * _every_ other avenue has been traveled. + **/ + void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); + do_set_cpus_allowed(tsk, is_in_v2_mode() ? + task_cs(tsk)->cpus_allowed : cpu_possible_mask); rcu_read_unlock(); /* diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 9829c67ebc0a..940d2e8db776 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -48,7 +48,7 @@ struct pids_cgroup { * %PIDS_MAX = (%PID_MAX_LIMIT + 1). */ atomic64_t counter; - int64_t limit; + atomic64_t limit; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -76,8 +76,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->limit, PIDS_MAX); atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -149,13 +149,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > p->limit) + if (new > limit) goto revert; } @@ -247,7 +248,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } -static void pids_free(struct task_struct *task) +static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -280,7 +281,7 @@ set_limit: * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. */ - pids->limit = limit; + atomic64_set(&pids->limit, limit); return nbytes; } @@ -288,7 +289,7 @@ static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; + int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); @@ -342,7 +343,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .free = pids_free, + .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, diff --git a/kernel/cpu.c b/kernel/cpu.c index 5c907d96e3dd..49273130e4f1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -314,6 +314,15 @@ void cpus_write_unlock(void) void lockdep_assert_cpus_held(void) { + /* + * We can't have hotplug operations before userspace starts running, + * and some init codepaths will knowingly not take the hotplug lock. + * This is all valid, so mute lockdep until it makes sense to report + * unheld locks. + */ + if (system_state < SYSTEM_RUNNING) + return; + percpu_rwsem_assert_held(&cpu_hotplug_lock); } @@ -356,9 +365,6 @@ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; -EXPORT_SYMBOL_GPL(cpu_smt_control); - -static bool cpu_smt_available __read_mostly; void __init cpu_smt_disable(bool force) { @@ -370,31 +376,18 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { + pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } } /* * The decision whether SMT is supported can only be done after the full - * CPU identification. Called from architecture code before non boot CPUs - * are brought up. - */ -void __init cpu_smt_check_topology_early(void) -{ - if (!topology_smt_supported()) - cpu_smt_control = CPU_SMT_NOT_SUPPORTED; -} - -/* - * If SMT was disabled by BIOS, detect it here, after the CPUs have been - * brought online. This ensures the smt/l1tf sysfs entries are consistent - * with reality. cpu_smt_available is set to true during the bringup of non - * boot CPUs when a SMT sibling is detected. Note, this may overwrite - * cpu_smt_control's previous setting. + * CPU identification. Called from architecture code. */ void __init cpu_smt_check_topology(void) { - if (!cpu_smt_available) + if (!topology_smt_supported()) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; } @@ -407,18 +400,10 @@ early_param("nosmt", smt_cmdline_disable); static inline bool cpu_smt_allowed(unsigned int cpu) { - if (topology_is_primary_thread(cpu)) + if (cpu_smt_control == CPU_SMT_ENABLED) return true; - /* - * If the CPU is not a 'primary' thread and the booted_once bit is - * set then the processor has SMT support. Store this information - * for the late check of SMT support in cpu_smt_check_topology(). - */ - if (per_cpu(cpuhp_state, cpu).booted_once) - cpu_smt_available = true; - - if (cpu_smt_control == CPU_SMT_ENABLED) + if (topology_is_primary_thread(cpu)) return true; /* @@ -563,6 +548,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) } } +static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) +{ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + /* + * When CPU hotplug is disabled, then taking the CPU down is not + * possible because takedown_cpu() and the architecture and + * subsystem specific mechanisms are not available. So the CPU + * which would be completely unplugged again needs to stay around + * in the current state. + */ + return st->state <= CPUHP_BRINGUP_CPU; +} + static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { @@ -573,8 +572,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { - st->target = prev_state; - undo_cpu_up(cpu, st); + if (can_rollback_cpu(st)) { + st->target = prev_state; + undo_cpu_up(cpu, st); + } break; } } @@ -1944,6 +1945,9 @@ static ssize_t write_cpuhp_fail(struct device *dev, if (ret) return ret; + if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) + return -EINVAL; + /* * Cannot fail STARTING/DYING callbacks. */ @@ -2054,7 +2058,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2088,7 +2092,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static int cpuhp_smt_enable(void) +int cpuhp_smt_enable(void) { int cpu, ret = 0; @@ -2297,3 +2301,46 @@ void __init boot_cpu_hotplug_init(void) #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } + +/* + * These are used for a global "mitigations=" cmdline option for toggling + * optional CPU mitigations. + */ +enum cpu_mitigations { + CPU_MITIGATIONS_OFF, + CPU_MITIGATIONS_AUTO, + CPU_MITIGATIONS_AUTO_NOSMT, +}; + +static enum cpu_mitigations cpu_mitigations __ro_after_init = + CPU_MITIGATIONS_AUTO; + +static int __init mitigations_parse_cmdline(char *arg) +{ + if (!strcmp(arg, "off")) + cpu_mitigations = CPU_MITIGATIONS_OFF; + else if (!strcmp(arg, "auto")) + cpu_mitigations = CPU_MITIGATIONS_AUTO; + else if (!strcmp(arg, "auto,nosmt")) + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + else + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", + arg); + + return 0; +} +early_param("mitigations", mitigations_parse_cmdline); + +/* mitigations=off */ +bool cpu_mitigations_off(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_OFF; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_off); + +/* mitigations=auto,nosmt */ +bool cpu_mitigations_auto_nosmt(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); diff --git a/kernel/cred.c b/kernel/cred.c index ecf03657e71c..a9f0f8b21d8c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -147,7 +147,10 @@ void __put_cred(struct cred *cred) BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); - call_rcu(&cred->rcu, put_cred_rcu); + if (cred->non_rcu) + put_cred_rcu(&cred->rcu); + else + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); @@ -217,7 +220,7 @@ struct cred *cred_alloc_blank(void) new->magic = CRED_MAGIC; #endif - if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; @@ -258,6 +261,7 @@ struct cred *prepare_creds(void) old = task->cred; memcpy(new, old, sizeof(struct cred)); + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); @@ -275,7 +279,7 @@ struct cred *prepare_creds(void) new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; validate_creds(new); return new; @@ -448,6 +452,15 @@ int commit_creds(struct cred *new) if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + /* + * If a task drops privileges and becomes nondumpable, + * the dumpability change must become visible before + * the credential change; otherwise, a __ptrace_may_access() + * racing with this change may be able to attach to a task it + * shouldn't be able to attach to (as if the task had dropped + * privileges without becoming nondumpable). + * Pairs with a read barrier in __ptrace_may_access(). + */ smp_wmb(); } @@ -528,7 +541,19 @@ const struct cred *override_creds(const struct cred *new) validate_creds(old); validate_creds(new); - get_cred(new); + + /* + * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. + * + * That means that we do not clear the 'non_rcu' flag, since + * we are only installing the cred into the thread-synchronous + * '->cred' pointer, not the '->real_cred' pointer that is + * visible to other threads under RCU. + * + * Also note that we did validate_creds() manually, not depending + * on the validation in 'get_cred()'. + */ + get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); @@ -611,6 +636,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); @@ -628,7 +654,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) #ifdef CONFIG_SECURITY new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 65c0f1363788..94aa9ae0007a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -535,6 +535,8 @@ return_normal: arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; @@ -667,6 +669,8 @@ kgdb_restore: if (trace_on) tracing_on(); + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7921ae4fca8d..7e2379aa0a1e 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -186,7 +186,16 @@ kdb_bt(int argc, const char **argv) kdb_printf("btc: cpu status: "); kdb_parse("cpu\n"); for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); + void *kdb_tsk = KDB_TSK(cpu); + + /* If a CPU failed to round up we could be here */ + if (!kdb_tsk) { + kdb_printf("WARNING: no task for cpu %ld\n", + cpu); + continue; + } + + sprintf(buf, "btt 0x%px\n", kdb_tsk); kdb_parse(buf); touch_nmi_watchdog(); } diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 15e1a7af5dd0..53a0df6e4d92 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -118,13 +118,6 @@ int kdb_stub(struct kgdb_state *ks) kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); KDB_STATE_SET(PAGER); - /* zero out any offline cpu data */ - for_each_present_cpu(i) { - if (!cpu_online(i)) { - kgdb_info[i].debuggerinfo = NULL; - kgdb_info[i].task = NULL; - } - } if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) { ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); diff --git a/kernel/elfcore.c b/kernel/elfcore.c index fc482c8e0bd8..57fb4dcff434 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -3,6 +3,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/binfmts.h> +#include <linux/elfcore.h> Elf_Half __weak elf_core_extra_phdrs(void) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 991af683ef9e..ea4f3f7a0c6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -436,18 +436,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (ret || !write) - return ret; - + int ret; + int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ - if (sysctl_perf_cpu_time_max_percent == 100 || - sysctl_perf_cpu_time_max_percent == 0) + if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -4738,6 +4738,11 @@ static void __perf_event_period(struct perf_event *event, } } +static int perf_event_check_period(struct perf_event *event, u64 value) +{ + return event->pmu->check_period(event, value); +} + static int perf_event_period(struct perf_event *event, u64 __user *arg) { u64 value; @@ -4754,6 +4759,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; + if (perf_event_check_period(event, value)) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; @@ -5622,7 +5630,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (current->mm) { + } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs, regs_user_copy); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -6915,6 +6923,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; + u32 type = mmap_event->event_id.header.type; int ret; if (!perf_event_mmap_match(event, data)) @@ -6958,6 +6967,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_end(&handle); out: mmap_event->event_id.header.size = size; + mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -8951,6 +8961,11 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } +static int perf_event_nop_int(struct perf_event *event, u64 value) +{ + return 0; +} + static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) @@ -9251,6 +9266,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->check_period) + pmu->check_period = perf_event_nop_int; + if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; @@ -10456,7 +10474,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c573c7339223..f3a69a4f0d57 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -49,14 +49,30 @@ static void perf_output_put_handle(struct perf_output_handle *handle) unsigned long head; again: + /* + * In order to avoid publishing a head value that goes backwards, + * we must ensure the load of @rb->head happens after we've + * incremented @rb->nest. + * + * Otherwise we can observe a @rb->head value before one published + * by an IRQ/NMI happening between the load and the increment. + */ + barrier(); head = local_read(&rb->head); /* - * IRQ/NMI can happen here, which means we can miss a head update. + * IRQ/NMI can happen here and advance @rb->head, causing our + * load above to be stale. */ - if (!local_dec_and_test(&rb->nest)) + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + if (local_read(&rb->nest) > 1) { + local_dec(&rb->nest); goto out; + } /* * Since the mmap() consumer (userspace) can run on a different CPU: @@ -85,12 +101,21 @@ again: * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ - rb->user_page->data_head = head; + WRITE_ONCE(rb->user_page->data_head, head); /* - * Now check if we missed an update -- rely on previous implied - * compiler barriers to force a re-read. + * We must publish the head before decrementing the nest count, + * otherwise an IRQ/NMI can publish a more recent head value and our + * write will (temporarily) publish a stale value. */ + barrier(); + local_set(&rb->nest, 0); + + /* + * Ensure we decrement @rb->nest before we validate the @rb->head. + * Otherwise we cannot be sure we caught the 'last' nested update. + */ + barrier(); if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); goto again; @@ -464,7 +489,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags); } - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; @@ -495,7 +520,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; @@ -719,6 +744,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct ring_buffer); size += nr_pages * sizeof(void *); + if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) + goto fail; + rb = kzalloc(size, GFP_KERNEL); if (!rb) goto fail; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 01941cffa9c2..c74fc9826250 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1854,7 +1854,7 @@ static void handle_trampoline(struct pt_regs *regs) sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); - force_sig_info(SIGILL, SEND_SIG_FORCED, current); + force_sig(SIGILL, current); } @@ -1970,7 +1970,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); - force_sig_info(SIGILL, SEND_SIG_FORCED, current); + force_sig(SIGILL, current); } } diff --git a/kernel/exit.c b/kernel/exit.c index 3aa01b74c1e3..d1baf9c96c3e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -193,6 +193,7 @@ repeat: rcu_read_unlock(); proc_flush_task(p); + cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -306,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w) * MB (A) MB (B) * [L] cond [L] tsk */ - smp_rmb(); /* (B) */ + smp_mb(); /* (B) */ /* * Avoid using task_rcu_dereference() magic as long as we are careful, @@ -496,7 +497,7 @@ static void exit_mm(void) struct mm_struct *mm = current->mm; struct core_state *core_state; - mm_release(current, mm); + exit_mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); @@ -576,10 +577,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father, } write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: father->exit_code); - } list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); @@ -802,32 +799,12 @@ void __noreturn do_exit(long code) */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; + futex_exit_recursive(tsk); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } exit_signals(tsk); /* sets PF_EXITING */ - /* - * Ensure that all new tsk->pi_lock acquisitions must observe - * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). - */ - smp_mb(); - /* - * Ensure that we must observe the pi_state in exit_mm() -> - * mm_release() -> exit_pi_state_list(). - */ - raw_spin_lock_irq(&tsk->pi_lock); - raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -842,6 +819,14 @@ void __noreturn do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { + /* + * If the last thread of global init has exited, panic + * immediately to get a useable coredump. + */ + if (unlikely(is_global_init(tsk))) + panic("Attempted to kill init! exitcode=0x%08x\n", + tsk->signal->group_exit_code ?: (int)code); + #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); @@ -901,12 +886,6 @@ void __noreturn do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. - */ - tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 6d6ce2c3a364..0a328cf0cb13 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -162,10 +162,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_stack(unsigned long *stack) -{ -} - #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* @@ -348,7 +344,6 @@ static void release_task_stack(struct task_struct *tsk) return; /* Better to leak the stack than to free prematurely */ account_kernel_stack(tsk, -1); - arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); tsk->stack = NULL; #ifdef CONFIG_VMAP_STACK @@ -415,7 +410,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); cgroup_free(tsk); - task_numa_free(tsk); + task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); @@ -790,6 +785,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -1128,24 +1132,8 @@ static int wait_for_vfork_done(struct task_struct *child, * restoring the old one. . . * Eric Biederman 10 January 1998 */ -void mm_release(struct task_struct *tsk, struct mm_struct *mm) +static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - /* Get rid of any futexes when releasing the mm */ -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) { - exit_robust_list(tsk); - tsk->robust_list = NULL; - } -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { - compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; - } -#endif - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); -#endif - uprobe_free_utask(tsk); /* Get rid of any cached register state */ @@ -1178,6 +1166,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } +void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + futex_exit_release(tsk); + mm_release(tsk, mm); +} + +void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + futex_exec_release(tsk); + mm_release(tsk, mm); +} + /* * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. @@ -1211,6 +1211,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1528,6 +1529,21 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1776,14 +1792,8 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_BLOCK p->plug = NULL; #endif -#ifdef CONFIG_FUTEX - p->robust_list = NULL; -#ifdef CONFIG_COMPAT - p->compat_robust_list = NULL; -#endif - INIT_LIST_HEAD(&p->pi_state_list); - p->pi_state_cache = NULL; -#endif + futex_init_task(p); + /* * sigaltstack should be cleared when sharing the same VM */ @@ -1960,8 +1970,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1992,7 +2004,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: return ERR_PTR(retval); } @@ -2472,7 +2484,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, struct ctl_table t; int ret; int threads = max_threads; - int min = MIN_THREADS; + int min = 1; int max = MAX_THREADS; t = *table; @@ -2484,7 +2496,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, if (ret || !write) return ret; - set_max_threads(threads); + max_threads = threads; return 0; } diff --git a/kernel/futex.c b/kernel/futex.c index 046cd780d057..f5aae14c247b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -44,6 +44,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/compat.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fs.h> @@ -173,8 +174,10 @@ * double_lock_hb() and double_unlock_hb(), respectively. */ -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -int __read_mostly futex_cmpxchg_enabled; +#ifdef CONFIG_HAVE_FUTEX_CMPXCHG +#define futex_cmpxchg_enabled 1 +#else +static int __read_mostly futex_cmpxchg_enabled; #endif /* @@ -338,6 +341,12 @@ static inline bool should_fail_futex(bool fshared) } #endif /* CONFIG_FAIL_FUTEX */ +#ifdef CONFIG_COMPAT +static void compat_exit_robust_list(struct task_struct *curr); +#else +static inline void compat_exit_robust_list(struct task_struct *curr) { } +#endif + static inline void futex_get_mm(union futex_key *key) { mmgrab(key->private.mm); @@ -887,7 +896,7 @@ static struct task_struct *futex_find_get_task(pid_t pid) * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ -void exit_pi_state_list(struct task_struct *curr) +static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; @@ -957,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr) } raw_spin_unlock_irq(&curr->pi_lock); } - +#else +static inline void exit_pi_state_list(struct task_struct *curr) { } #endif /* @@ -1166,12 +1176,99 @@ out_error: return ret; } +/** + * wait_for_owner_exiting - Block until the owner has exited + * @exiting: Pointer to the exiting task + * + * Caller must hold a refcount on @exiting. + */ +static void wait_for_owner_exiting(int ret, struct task_struct *exiting) +{ + if (ret != -EBUSY) { + WARN_ON_ONCE(exiting); + return; + } + + if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) + return; + + mutex_lock(&exiting->futex_exit_mutex); + /* + * No point in doing state checking here. If the waiter got here + * while the task was in exec()->exec_futex_release() then it can + * have any FUTEX_STATE_* value when the waiter has acquired the + * mutex. OK, if running, EXITING or DEAD if it reached exit() + * already. Highly unlikely and not a problem. Just one more round + * through the futex maze. + */ + mutex_unlock(&exiting->futex_exit_mutex); + + put_task_struct(exiting); +} + +static int handle_exit_race(u32 __user *uaddr, u32 uval, + struct task_struct *tsk) +{ + u32 uval2; + + /* + * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the + * caller that the alleged owner is busy. + */ + if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) + return -EBUSY; + + /* + * Reread the user space value to handle the following situation: + * + * CPU0 CPU1 + * + * sys_exit() sys_futex() + * do_exit() futex_lock_pi() + * futex_lock_pi_atomic() + * exit_signals(tsk) No waiters: + * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID + * mm_release(tsk) Set waiter bit + * exit_robust_list(tsk) { *uaddr = 0x80000PID; + * Set owner died attach_to_pi_owner() { + * *uaddr = 0xC0000000; tsk = get_task(PID); + * } if (!tsk->flags & PF_EXITING) { + * ... attach(); + * tsk->futex_state = } else { + * FUTEX_STATE_DEAD; if (tsk->futex_state != + * FUTEX_STATE_DEAD) + * return -EAGAIN; + * return -ESRCH; <--- FAIL + * } + * + * Returning ESRCH unconditionally is wrong here because the + * user space value has been changed by the exiting task. + * + * The same logic applies to the case where the exiting task is + * already gone. + */ + if (get_futex_value_locked(&uval2, uaddr)) + return -EFAULT; + + /* If the user space value has changed, try again. */ + if (uval2 != uval) + return -EAGAIN; + + /* + * The exiting task did not have a robust list, the robust list was + * corrupted or the user space value in *uaddr is simply bogus. + * Give up and tell user space. + */ + return -ESRCH; +} + /* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. */ -static int attach_to_pi_owner(u32 uval, union futex_key *key, - struct futex_pi_state **ps) +static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, + struct futex_pi_state **ps, + struct task_struct **exiting) { pid_t pid = uval & FUTEX_TID_MASK; struct futex_pi_state *pi_state; @@ -1180,12 +1277,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 [1] + * + * The !pid check is paranoid. None of the call sites should end up + * with pid == 0, but better safe than sorry. Let the caller retry */ if (!pid) - return -ESRCH; + return -EAGAIN; p = futex_find_get_task(pid); if (!p) - return -ESRCH; + return handle_exit_race(uaddr, uval, NULL); if (unlikely(p->flags & PF_KTHREAD)) { put_task_struct(p); @@ -1193,22 +1293,33 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, } /* - * We need to look at the task state flags to figure out, - * whether the task is exiting. To protect against the do_exit - * change of the task flags, we do this protected by - * p->pi_lock: + * We need to look at the task state to figure out, whether the + * task is exiting. To protect against the change of the task state + * in futex_exit_release(), we do this protected by p->pi_lock: */ raw_spin_lock_irq(&p->pi_lock); - if (unlikely(p->flags & PF_EXITING)) { + if (unlikely(p->futex_state != FUTEX_STATE_OK)) { /* - * The task is on the way out. When PF_EXITPIDONE is - * set, we know that the task has finished the - * cleanup: + * The task is on the way out. When the futex state is + * FUTEX_STATE_DEAD, we know that the task has finished + * the cleanup: */ - int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; + int ret = handle_exit_race(uaddr, uval, p); raw_spin_unlock_irq(&p->pi_lock); - put_task_struct(p); + /* + * If the owner task is between FUTEX_STATE_EXITING and + * FUTEX_STATE_DEAD then store the task pointer and keep + * the reference on the task struct. The calling code will + * drop all locks, wait for the task to reach + * FUTEX_STATE_DEAD and then drop the refcount. This is + * required to prevent a live lock when the current task + * preempted the exiting task between the two states. + */ + if (ret == -EBUSY) + *exiting = p; + else + put_task_struct(p); return ret; } @@ -1247,7 +1358,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, static int lookup_pi_state(u32 __user *uaddr, u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + union futex_key *key, struct futex_pi_state **ps, + struct task_struct **exiting) { struct futex_q *top_waiter = futex_top_waiter(hb, key); @@ -1262,18 +1374,20 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, * We are the first waiter - try to look up the owner based on * @uval and attach to it. */ - return attach_to_pi_owner(uval, key, ps); + return attach_to_pi_owner(uaddr, uval, key, ps, exiting); } static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { + int err; u32 uninitialized_var(curval); if (unlikely(should_fail_futex(true))) return -EFAULT; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (unlikely(err)) + return err; /* If user space value changed, let the caller retry */ return curval != uval ? -EAGAIN : 0; @@ -1288,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * lookup * @task: the task to perform the atomic lock work for. This will * be "current" except in the case of requeue pi. + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Return: @@ -1296,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * - <0 - error * * The hb->lock and futex_key refs shall be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. */ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task, int set_waiters) + struct task_struct *task, + struct task_struct **exiting, + int set_waiters) { u32 uval, newval, vpid = task_pid_vnr(task); struct futex_q *top_waiter; @@ -1370,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * attach to the owner. If that fails, no harm done, we only * set the FUTEX_WAITERS bit in the user space variable. */ - return attach_to_pi_owner(uval, key, ps); + return attach_to_pi_owner(uaddr, newval, key, ps, exiting); } /** @@ -1405,11 +1527,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return; - /* - * Queue the task for later wakeup for after we've released - * the hb->lock. wake_q_add() grabs reference to p. - */ - wake_q_add(wake_q, p); + get_task_struct(p); __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL @@ -1419,6 +1537,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * plist_del in __unqueue_futex(). */ smp_store_release(&q->lock_ptr, NULL); + + /* + * Queue the task for later wakeup for after we've released + * the hb->lock. wake_q_add() grabs reference to p. + */ + wake_q_add(wake_q, p); + put_task_struct(p); } /* @@ -1456,10 +1581,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { - ret = -EFAULT; - - } else if (curval != uval) { + ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (!ret && (curval != uval)) { /* * If a unconditional UNLOCK_PI operation (user space did not * try the TID->0 transition) raced with a waiter setting the @@ -1654,32 +1777,32 @@ retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); -#ifndef CONFIG_MMU - /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking - */ - ret = op_ret; - goto out_put_keys; -#endif - - if (unlikely(op_ret != -EFAULT)) { + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ ret = op_ret; goto out_put_keys; } - ret = fault_in_user_writeable(uaddr2); - if (ret) - goto out_put_keys; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + goto out_put_keys; + } - if (!(flags & FLAGS_SHARED)) + if (!(flags & FLAGS_SHARED)) { + cond_resched(); goto retry_private; + } put_futex_key(&key2); put_futex_key(&key1); + cond_resched(); goto retry; } @@ -1788,6 +1911,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * @key1: the from futex key * @key2: the to futex key * @ps: address to store the pi_state pointer + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Try and get the lock on behalf of the top waiter if we can do it atomically. @@ -1795,16 +1920,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. + * * Return: * - 0 - failed to acquire the lock atomically; * - >0 - acquired the lock, return value is vpid of the top_waiter * - <0 - error */ -static int futex_proxy_trylock_atomic(u32 __user *pifutex, - struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, - union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps, int set_waiters) +static int +futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key1, + union futex_key *key2, struct futex_pi_state **ps, + struct task_struct **exiting, int set_waiters) { struct futex_q *top_waiter = NULL; u32 curval; @@ -1841,7 +1970,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, */ vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, - set_waiters); + exiting, set_waiters); if (ret == 1) { requeue_pi_wake_futex(top_waiter, key2, hb2); return vpid; @@ -1970,6 +2099,8 @@ retry_private: } if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + struct task_struct *exiting = NULL; + /* * Attempt to acquire uaddr2 and wake the top waiter. If we * intend to requeue waiters, force setting the FUTEX_WAITERS @@ -1977,7 +2108,8 @@ retry_private: * faults rather in the requeue loop below. */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, nr_requeue); + &key2, &pi_state, + &exiting, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or is @@ -2004,7 +2136,8 @@ retry_private: * If that call succeeds then we have pi_state and an * initial refcount on it. */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, + &pi_state, &exiting); } switch (ret) { @@ -2022,17 +2155,24 @@ retry_private: if (!ret) goto retry; goto out; + case -EBUSY: case -EAGAIN: /* * Two reasons for this: - * - Owner is exiting and we just wait for the + * - EBUSY: Owner is exiting and we just wait for the * exit to complete. - * - The user space value changed. + * - EAGAIN: The user space value changed. */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); cond_resched(); goto retry; default: @@ -2304,7 +2444,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 uval, uninitialized_var(curval), newval; struct task_struct *oldowner, *newowner; u32 newtid; - int ret; + int ret, err = 0; lockdep_assert_held(q->lock_ptr); @@ -2375,14 +2515,17 @@ retry: if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; + err = get_futex_value_locked(&uval, uaddr); + if (err) + goto handle_err; for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - goto handle_fault; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (err) + goto handle_err; + if (curval == uval) break; uval = curval; @@ -2410,23 +2553,37 @@ retry: return 0; /* - * To handle the page fault we need to drop the locks here. That gives - * the other task (either the highest priority waiter itself or the - * task which stole the rtmutex) the chance to try the fixup of the - * pi_state. So once we are back from handling the fault we need to - * check the pi_state after reacquiring the locks and before trying to - * do another fixup. When the fixup has been done already we simply - * return. + * In order to reschedule or handle a page fault, we need to drop the + * locks here. In the case of a fault, this gives the other task + * (either the highest priority waiter itself or the task which stole + * the rtmutex) the chance to try the fixup of the pi_state. So once we + * are back from handling the fault we need to check the pi_state after + * reacquiring the locks and before trying to do another fixup. When + * the fixup has been done already we simply return. * * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely * drop hb->lock since the caller owns the hb -> futex_q relation. * Dropping the pi_mutex->wait_lock requires the state revalidate. */ -handle_fault: +handle_err: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); - ret = fault_in_user_writeable(uaddr); + switch (err) { + case -EFAULT: + ret = fault_in_user_writeable(uaddr); + break; + + case -EAGAIN: + cond_resched(); + ret = 0; + break; + + default: + WARN_ON_ONCE(1); + ret = err; + break; + } spin_lock(q->lock_ptr); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); @@ -2731,6 +2888,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to = NULL; struct futex_pi_state *pi_state = NULL; + struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; @@ -2758,7 +2916,8 @@ retry: retry_private: hb = queue_lock(&q); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); if (unlikely(ret)) { /* * Atomic work succeeded and we got the lock, @@ -2771,15 +2930,22 @@ retry_private: goto out_unlock_put_key; case -EFAULT: goto uaddr_faulted; + case -EBUSY: case -EAGAIN: /* * Two reasons for this: - * - Task is exiting and we just wait for the + * - EBUSY: Task is exiting and we just wait for the * exit to complete. - * - The user space value changed. + * - EAGAIN: The user space value changed. */ queue_unlock(hb); put_futex_key(&q.key); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); cond_resched(); goto retry; default: @@ -2811,35 +2977,39 @@ retry_private: * and BUG when futex_unlock_pi() interleaves with this. * * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling rt_mutex_start_proxy_lock(). This still fully - * serializes against futex_unlock_pi() as that does the exact same - * lock handoff sequence. + * latter before calling __rt_mutex_start_proxy_lock(). This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. */ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. + */ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); if (ret) { if (ret == 1) ret = 0; - - spin_lock(q.lock_ptr); - goto no_block; + goto cleanup; } - if (unlikely(to)) hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); +cleanup: spin_lock(q.lock_ptr); /* - * If we failed to acquire the lock (signal/timeout), we must + * If we failed to acquire the lock (deadlock/signal/timeout), we must * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex - * wait lists consistent. + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait + * lists consistent. * * In particular; it is important that futex_unlock_pi() can not * observe this inconsistency. @@ -2963,6 +3133,10 @@ retry: * there is no point where we hold neither; and therefore * wake_futex_pi() must observe a state consistent with what we * observed. + * + * In particular; this forces __rt_mutex_start_proxy() to + * complete such that we're guaranteed to observe the + * rt_waiter. Also see the WARN in wake_futex_pi(). */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); @@ -2987,10 +3161,8 @@ retry: * A unconditional UNLOCK_PI op raced against a waiter * setting the FUTEX_WAITERS bit. Try again. */ - if (ret == -EAGAIN) { - put_futex_key(&key); - goto retry; - } + if (ret == -EAGAIN) + goto pi_retry; /* * wake_futex_pi has detected invalid state. Tell user * space. @@ -3005,9 +3177,19 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { spin_unlock(&hb->lock); - goto pi_faulted; + switch (ret) { + case -EFAULT: + goto pi_faulted; + + case -EAGAIN: + goto pi_retry; + + default: + WARN_ON_ONCE(1); + goto out_putkey; + } } /* @@ -3021,6 +3203,11 @@ out_putkey: put_futex_key(&key); return ret; +pi_retry: + put_futex_key(&key); + cond_resched(); + goto retry; + pi_faulted: put_futex_key(&key); @@ -3374,54 +3561,115 @@ err_unlock: return ret; } +/* Constants for the pending_op argument of handle_futex_death */ +#define HANDLE_DEATH_PENDING true +#define HANDLE_DEATH_LIST false + /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) +static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, + bool pi, bool pending_op) { u32 uval, uninitialized_var(nval), mval; + int err; + + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { - /* - * Ok, this dying thread is truly holding a futex - * of interest. Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). (We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - /* - * We are not holding a lock here, but we want to have - * the pagefault_disable/enable() protection because - * we want to handle the fault gracefully. If the - * access fails we try to fault in the futex with R/W - * verification via get_user_pages. get_user() above - * does not guarantee R/W access. If that fails we - * give up and leave the futex locked. - */ - if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + /* + * Special case for regular (non PI) futexes. The unlock path in + * user space has two race scenarios: + * + * 1. The unlock path releases the user space futex value and + * before it can execute the futex() syscall to wake up + * waiters it is killed. + * + * 2. A woken up waiter is killed before it can acquire the + * futex in user space. + * + * In both cases the TID validation below prevents a wakeup of + * potential waiters which can cause these waiters to block + * forever. + * + * In both cases the following conditions are met: + * + * 1) task->robust_list->list_op_pending != NULL + * @pending_op == true + * 2) User space futex value == 0 + * 3) Regular futex: @pi == false + * + * If these conditions are met, it is safe to attempt waking up a + * potential waiter without touching the user space futex value and + * trying to set the OWNER_DIED bit. The user space futex value is + * uncontended and the rest of the user space mutex state is + * consistent, so a woken waiter will just take over the + * uncontended futex. Setting the OWNER_DIED bit would create + * inconsistent state and malfunction of the user space owner died + * handling. + */ + if (pending_op && !pi && !uval) { + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; + } + + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { + switch (err) { + case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; - } - if (nval != uval) + + case -EAGAIN: + cond_resched(); goto retry; - /* - * Wake robust non-PI futexes here. The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + default: + WARN_ON_ONCE(1); + return err; + } } + + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; } @@ -3449,7 +3697,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, * * We silently return on any sign of list-walking problem. */ -void exit_robust_list(struct task_struct *curr) +static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; @@ -3490,10 +3738,11 @@ void exit_robust_list(struct task_struct *curr) * A pending lock might already be on the list, so * don't process it twice: */ - if (entry != pending) + if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, - curr, pi)) + curr, pi, HANDLE_DEATH_LIST)) return; + } if (rc) return; entry = next_entry; @@ -3507,9 +3756,118 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } - if (pending) + if (pending) { handle_futex_death((void __user *)pending + futex_offset, - curr, pip); + curr, pip, HANDLE_DEATH_PENDING); + } +} + +static void futex_cleanup(struct task_struct *tsk) +{ + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); + tsk->robust_list = NULL; + } + +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) { + compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } +#endif + + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); +} + +/** + * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD + * @tsk: task to set the state on + * + * Set the futex exit state of the task lockless. The futex waiter code + * observes that state when a task is exiting and loops until the task has + * actually finished the futex cleanup. The worst case for this is that the + * waiter runs through the wait loop until the state becomes visible. + * + * This is called from the recursive fault handling path in do_exit(). + * + * This is best effort. Either the futex exit code has run already or + * not. If the OWNER_DIED bit has been set on the futex then the waiter can + * take it over. If not, the problem is pushed back to user space. If the + * futex exit code did not run yet, then an already queued waiter might + * block forever, but there is nothing which can be done about that. + */ +void futex_exit_recursive(struct task_struct *tsk) +{ + /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ + if (tsk->futex_state == FUTEX_STATE_EXITING) + mutex_unlock(&tsk->futex_exit_mutex); + tsk->futex_state = FUTEX_STATE_DEAD; +} + +static void futex_cleanup_begin(struct task_struct *tsk) +{ + /* + * Prevent various race issues against a concurrent incoming waiter + * including live locks by forcing the waiter to block on + * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in + * attach_to_pi_owner(). + */ + mutex_lock(&tsk->futex_exit_mutex); + + /* + * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. + * + * This ensures that all subsequent checks of tsk->futex_state in + * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with + * tsk->pi_lock held. + * + * It guarantees also that a pi_state which was queued right before + * the state change under tsk->pi_lock by a concurrent waiter must + * be observed in exit_pi_state_list(). + */ + raw_spin_lock_irq(&tsk->pi_lock); + tsk->futex_state = FUTEX_STATE_EXITING; + raw_spin_unlock_irq(&tsk->pi_lock); +} + +static void futex_cleanup_end(struct task_struct *tsk, int state) +{ + /* + * Lockless store. The only side effect is that an observer might + * take another loop until it becomes visible. + */ + tsk->futex_state = state; + /* + * Drop the exit protection. This unblocks waiters which observed + * FUTEX_STATE_EXITING to reevaluate the state. + */ + mutex_unlock(&tsk->futex_exit_mutex); +} + +void futex_exec_release(struct task_struct *tsk) +{ + /* + * The state handling is done for consistency, but in the case of + * exec() there is no way to prevent futher damage as the PID stays + * the same. But for the unlikely and arguably buggy case that a + * futex is held on exec(), this provides at least as much state + * consistency protection which is possible. + */ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + /* + * Reset the state to FUTEX_STATE_OK. The task is alive and about + * exec a new binary. + */ + futex_cleanup_end(tsk, FUTEX_STATE_OK); +} + +void futex_exit_release(struct task_struct *tsk) +{ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -3605,6 +3963,193 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } +#ifdef CONFIG_COMPAT +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t __user *head, unsigned int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + +static void __user *futex_uaddr(struct robust_list __user *entry, + compat_long_t futex_offset) +{ + compat_uptr_t base = ptr_to_compat(entry); + void __user *uaddr = compat_ptr(base + futex_offset); + + return uaddr; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. + */ +static void compat_exit_robust_list(struct task_struct *curr) +{ + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int uninitialized_var(next_pi); + compat_uptr_t uentry, next_uentry, upending; + compat_long_t futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (compat_fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = compat_fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) { + void __user *uaddr = futex_uaddr(entry, futex_offset); + + if (handle_futex_death(uaddr, curr, pi, + HANDLE_DEATH_LIST)) + return; + } + if (rc) + return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + if (pending) { + void __user *uaddr = futex_uaddr(pending, futex_offset); + + handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); + } +} + +COMPAT_SYSCALL_DEFINE2(set_robust_list, + struct compat_robust_list_head __user *, head, + compat_size_t, len) +{ + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->compat_robust_list = head; + + return 0; +} + +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + compat_uptr_t __user *, head_ptr, + compat_size_t __user *, len_ptr) +{ + struct compat_robust_list_head __user *head; + unsigned long ret; + struct task_struct *p; + + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + rcu_read_lock(); + + ret = -ESRCH; + if (!pid) + p = current; + else { + p = find_task_by_vpid(pid); + if (!p) + goto err_unlock; + } + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = p->compat_robust_list; + rcu_read_unlock(); + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(ptr_to_compat(head), head_ptr); + +err_unlock: + rcu_read_unlock(); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct compat_timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) +{ + struct timespec ts; + ktime_t t, *tp = NULL; + int val2 = 0; + int cmd = op & FUTEX_CMD_MASK; + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (compat_get_timespec(&ts, utime)) + return -EFAULT; + if (!timespec_valid(&ts)) + return -EINVAL; + + t = timespec_to_ktime(ts); + if (cmd == FUTEX_WAIT) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (int) (unsigned long) utime; + + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); +} +#endif /* CONFIG_COMPAT */ + static void __init futex_detect_cmpxchg(void) { #ifndef CONFIG_HAVE_FUTEX_CMPXCHG diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c deleted file mode 100644 index 83f830acbb5f..000000000000 --- a/kernel/futex_compat.c +++ /dev/null @@ -1,202 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/kernel/futex_compat.c - * - * Futex compatibililty routines. - * - * Copyright 2006, Red Hat, Inc., Ingo Molnar - */ - -#include <linux/linkage.h> -#include <linux/compat.h> -#include <linux/nsproxy.h> -#include <linux/futex.h> -#include <linux/ptrace.h> -#include <linux/syscalls.h> - -#include <linux/uaccess.h> - - -/* - * Fetch a robust-list pointer. Bit 0 signals PI futexes: - */ -static inline int -fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, unsigned int *pi) -{ - if (get_user(*uentry, head)) - return -EFAULT; - - *entry = compat_ptr((*uentry) & ~1); - *pi = (unsigned int)(*uentry) & 1; - - return 0; -} - -static void __user *futex_uaddr(struct robust_list __user *entry, - compat_long_t futex_offset) -{ - compat_uptr_t base = ptr_to_compat(entry); - void __user *uaddr = compat_ptr(base + futex_offset); - - return uaddr; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. - */ -void compat_exit_robust_list(struct task_struct *curr) -{ - struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); - compat_uptr_t uentry, next_uentry, upending; - compat_long_t futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (fetch_robust_entry(&upending, &pending, - &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != (struct robust_list __user *) &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = fetch_robust_entry(&next_uentry, &next_entry, - (compat_uptr_t __user *)&entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * dont process it twice: - */ - if (entry != pending) { - void __user *uaddr = futex_uaddr(entry, futex_offset); - - if (handle_futex_death(uaddr, curr, pi)) - return; - } - if (rc) - return; - uentry = next_uentry; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - if (pending) { - void __user *uaddr = futex_uaddr(pending, futex_offset); - - handle_futex_death(uaddr, curr, pip); - } -} - -COMPAT_SYSCALL_DEFINE2(set_robust_list, - struct compat_robust_list_head __user *, head, - compat_size_t, len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->compat_robust_list = head; - - return 0; -} - -COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - compat_uptr_t __user *, head_ptr, - compat_size_t __user *, len_ptr) -{ - struct compat_robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; - - head = p->compat_robust_list; - rcu_read_unlock(); - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; -} - -COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct compat_timespec __user *, utime, u32 __user *, uaddr2, - u32, val3) -{ - struct timespec ts; - ktime_t t, *tp = NULL; - int val2 = 0; - int cmd = op & FUTEX_CMD_MASK; - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (compat_get_timespec(&ts, utime)) - return -EFAULT; - if (!timespec_valid(&ts)) - return -EINVAL; - - t = timespec_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -} diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 32b479468e4d..2e4869fa66c9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include <linux/lockdep.h> #include <linux/export.h> #include <linux/sysctl.h> +#include <linux/suspend.h> #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> @@ -33,7 +34,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ -#define HUNG_TASK_BATCHING 1024 +#define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: @@ -103,8 +104,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) - return; + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_show_lock = true; + hung_task_call_panic = true; + } /* * Ok, the task did not get scheduled for more than 2 minutes, @@ -126,11 +130,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) } touch_nmi_watchdog(); - - if (sysctl_hung_task_panic) { - hung_task_show_lock = true; - hung_task_call_panic = true; - } } /* @@ -164,7 +163,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; - int batch_count = HUNG_TASK_BATCHING; + unsigned long last_break = jiffies; struct task_struct *g, *t; /* @@ -179,10 +178,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) for_each_process_thread(g, t) { if (!max_count--) goto unlock; - if (!--batch_count) { - batch_count = HUNG_TASK_BATCHING; + if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; + last_break = jiffies; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) @@ -234,6 +233,28 @@ void reset_hung_task_detector(void) } EXPORT_SYMBOL_GPL(reset_hung_task_detector); +static bool hung_detector_suspended; + +static int hungtask_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + hung_detector_suspended = true; + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + hung_detector_suspended = false; + break; + default: + break; + } + return NOTIFY_OK; +} + /* * kthread which checks for tasks stuck in D state */ @@ -248,7 +269,8 @@ static int watchdog(void *dummy) long t = hung_timeout_jiffies(hung_last_checked, timeout); if (t <= 0) { - if (!atomic_xchg(&reset_hung_task, 0)) + if (!atomic_xchg(&reset_hung_task, 0) && + !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; @@ -262,6 +284,10 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + /* Disable hung task detector on suspend */ + pm_notifier(hungtask_pm_notify, 0); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); return 0; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5a2ef92c2782..317fc759de76 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -834,7 +834,11 @@ void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -863,7 +867,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -1355,6 +1363,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) { data = data->parent_data; + + if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE) + return 0; + if (data->chip->irq_set_wake) return data->chip->irq_set_wake(data, on); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 44ed5f8c8759..4ef7f3b820ce 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -240,12 +240,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc) #undef __irqd_to_state -static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +{ + __kstat_incr_irqs_this_cpu(desc); + desc->tot_count++; +} + static inline int irq_desc_get_node(struct irq_desc *desc) { return irq_common_data_get_node(&desc->irq_common_data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e97bbae947f0..92784b290564 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; + desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) @@ -276,6 +277,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc) } } +static void irq_sysfs_del(struct irq_desc *desc) +{ + /* + * If irq_sysfs_init() has not yet been invoked (early boot), then + * irq_kobj_base is NULL and the descriptor was never added. + * kobject_del() complains about a object with no parent, so make + * it conditional. + */ + if (irq_kobj_base) + kobject_del(&desc->kobj); +} + static int __init irq_sysfs_init(void) { struct irq_desc *desc; @@ -306,6 +319,7 @@ static struct kobj_type irq_kobj_type = { }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} +static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ @@ -419,7 +433,7 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - kobject_del(&desc->kobj); + irq_sysfs_del(desc); delete_irq_desc(irq); /* @@ -534,6 +548,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + mutex_init(&desc[i].request_mutex); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); @@ -895,11 +910,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - int cpu; unsigned int sum = 0; + int cpu; if (!desc || !desc->kstat_irqs) return 0; + if (!irq_settings_is_per_cpu_devid(desc) && + !irq_settings_is_per_cpu(desc)) + return desc->tot_count; + for_each_possible_cpu(cpu) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4cd85870f00e..9c86a3e45110 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,8 +323,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); - if (old_notify) + if (old_notify) { + cancel_work_sync(&old_notify->work); kref_put(&old_notify->kref, old_notify->release); + } return 0; } @@ -360,6 +362,9 @@ int irq_setup_affinity(struct irq_desc *desc) } cpumask_and(&mask, cpu_online_mask, set); + if (cpumask_empty(&mask)) + cpumask_copy(&mask, cpu_online_mask); + if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 1d08f45135c2..f7a9cfe63380 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,6 +38,8 @@ static void resend_irqs(unsigned long arg) irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); + if (!desc) + continue; local_irq_disable(); desc->handle_irq(desc); local_irq_enable(); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 127e7cfafa55..3e1b66366ac2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -296,8 +296,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, { char namebuf[KSYM_NAME_LEN]; - if (is_ksym_addr(addr)) - return !!get_symbol_pos(addr, symbolsize, offset); + if (is_ksym_addr(addr)) { + get_symbol_pos(addr, symbolsize, offset); + return 1; + } return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 20fef1a38602..27cf24e285e0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -301,6 +301,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; + if (fatal_signal_pending(current)) + return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; @@ -471,6 +473,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, } } + /* Ensure that these pages are decrypted if SME is enabled. */ + if (pages) + arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); + return pages; } @@ -865,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = -ENOMEM; goto out; } + arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, @@ -882,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); + arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5cbad4fb9107..d0fe20a5475f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -483,6 +483,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { + lockdep_assert_held(&text_mutex); /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -500,9 +501,7 @@ static void do_optimize_kprobes(void) list_empty(&optimizing_list)) return; - mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); - mutex_unlock(&text_mutex); } /* @@ -513,6 +512,7 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + lockdep_assert_held(&text_mutex); /* See comment in do_optimize_kprobes() */ lockdep_assert_cpus_held(); @@ -520,7 +520,6 @@ static void do_unoptimize_kprobes(void) if (list_empty(&unoptimizing_list)) return; - mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { @@ -537,7 +536,6 @@ static void do_unoptimize_kprobes(void) } else list_del_init(&op->list); } - mutex_unlock(&text_mutex); } /* Reclaim all kprobes on the free_list */ @@ -546,8 +544,14 @@ static void do_free_cleaned_kprobes(void) struct optimized_kprobe *op, *tmp; list_for_each_entry_safe(op, tmp, &freeing_list, list) { - BUG_ON(!kprobe_unused(&op->kp)); list_del_init(&op->list); + if (WARN_ON_ONCE(!kprobe_unused(&op->kp))) { + /* + * This must not happen, but if there is a kprobe + * still in use, keep it on kprobes hash list. + */ + continue; + } free_aggr_kprobe(&op->kp); } } @@ -563,6 +567,7 @@ static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); cpus_read_lock(); + mutex_lock(&text_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -590,6 +595,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + mutex_unlock(&text_mutex); cpus_read_unlock(); mutex_unlock(&kprobe_mutex); @@ -703,7 +709,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; - int ret; BUG_ON(!kprobe_unused(ap)); /* @@ -717,9 +722,8 @@ static int reuse_unused_kprobe(struct kprobe *ap) /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ - ret = kprobe_optready(ap); - if (ret) - return ret; + if (!kprobe_optready(ap)) + return -EINVAL; optimize_kprobe(ap); return 0; @@ -1503,7 +1507,8 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { + jump_label_text_reserved(p->addr, p->addr) || + find_bug((unsigned long)p->addr)) { ret = -EINVAL; goto out; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 7c51f065b212..f8dc77b18962 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -30,6 +30,7 @@ #include <linux/elf.h> #include <linux/moduleloader.h> #include <linux/completion.h> +#include <linux/memory.h> #include <asm/cacheflush.h> #include "core.h" #include "patch.h" @@ -635,16 +636,21 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); if (ret) { module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); return ret; } arch_klp_init_object_loaded(patch, obj); module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, @@ -935,6 +941,7 @@ err: pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; + obj->mod = NULL; klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf694c709b96..90a3469a7a88 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3402,17 +3402,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (depth && !cross_lock(lock)) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) { - /* - * Check: unsigned int references:12, overflow. - */ - if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) - return 0; + if (!references) + references++; + if (!hlock->references) hlock->references++; - } else { - hlock->references = 2; - } + + hlock->references += references; + + /* Overflow */ + if (DEBUG_LOCKS_WARN_ON(hlock->references < references)) + return 0; return 1; } @@ -3688,6 +3688,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ad69bbc9bd28..8b2ef15e3552 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -224,7 +224,6 @@ static void lockdep_stats_debug_show(struct seq_file *m) static int lockdep_stats_show(struct seq_file *m, void *v) { - struct lock_class *class; unsigned long nr_unused = 0, nr_uncategorized = 0, nr_irq_safe = 0, nr_irq_unsafe = 0, nr_softirq_safe = 0, nr_softirq_unsafe = 0, @@ -234,6 +233,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, sum_forward_deps = 0; +#ifdef CONFIG_PROVE_LOCKING + struct lock_class *class; + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) @@ -265,13 +267,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; -#ifdef CONFIG_PROVE_LOCKING sum_forward_deps += lockdep_count_forward_deps(class); -#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif + +#endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 1e882dfc8b79..3a6aeae98a4a 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -247,7 +247,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); + return READ_ONCE(prev->state) != vcpu_running; } /* diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4ad35718f123..71c554a9e17f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, rt_mutex_set_owner(lock, NULL); } +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; + lockdep_assert_held(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; @@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } - if (unlikely(ret)) - remove_waiter(lock, waiter); - debug_rt_mutex_print_deadlock(waiter); return ret; @@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * * Returns: * 0 - task blocked on lock * 1 - acquired the lock for task, caller should wake it up * <0 - error * - * Special API call for FUTEX_REQUEUE_PI support. + * Special API call for PI-futex support. */ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + if (unlikely(ret)) + remove_waiter(lock, waiter); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, * @lock: the rt_mutex we were woken on * @waiter: the pre-initialized rt_mutex_waiter * - * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). * * Unless we acquired the lock; we're still enqueued on the wait-list and can * in fact still be granted ownership until we're removed. Therefore we can diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a90336779375..3f5be624c764 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -130,6 +130,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, { struct rwsem_waiter *waiter, *tmp; long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; /* * Take a peek at the queue head waiter such that we can determine @@ -188,26 +189,25 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * of the queue. We know that woken will be at least 1 as we accounted * for above. Note we increment the 'active part' of the count by the * number of readers before waking any processes up. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. */ - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { - struct task_struct *tsk; - + list_for_each_entry(waiter, &sem->wait_list, list) { if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; woken++; - tsk = waiter->task; - - wake_q_add(wake_q, tsk); - list_del(&waiter->list); - /* - * Ensure that the last operation is setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot - * race with do_exit() by always holding a reference count - * to the task to wakeup. - */ - smp_store_release(&waiter->task, NULL); } + list_cut_before(&wlist, &sem->wait_list, &waiter->list); adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; if (list_empty(&sem->wait_list)) { @@ -217,6 +217,29 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, if (adjustment) atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + + tsk = waiter->task; + get_task_struct(tsk); + + /* + * Ensure calling get_task_struct() before setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. + */ + smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add(wake_q, tsk); + /* wake_q_add() already take the task ref */ + put_task_struct(tsk); + } } /* diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..03595c29c566 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -51,19 +51,19 @@ EXPORT_SYMBOL(__rwlock_init); static void spin_dump(raw_spinlock_t *lock, const char *msg) { - struct task_struct *owner = NULL; + struct task_struct *owner = READ_ONCE(lock->owner); - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; + if (owner == SPINLOCK_OWNER_INIT) + owner = NULL; printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", msg, raw_smp_processor_id(), current->comm, task_pid_nr(current)); printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " ".owner_cpu: %d\n", - lock, lock->magic, + lock, READ_ONCE(lock->magic), owner ? owner->comm : "<none>", owner ? task_pid_nr(owner) : -1, - lock->owner_cpu); + READ_ONCE(lock->owner_cpu)); dump_stack(); } @@ -80,16 +80,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg) static inline void debug_spin_lock_before(raw_spinlock_t *lock) { - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } static inline void debug_spin_lock_after(raw_spinlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_spin_unlock(raw_spinlock_t *lock) @@ -99,8 +99,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } /* @@ -183,8 +183,8 @@ static inline void debug_write_lock_before(rwlock_t *lock) static inline void debug_write_lock_after(rwlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_write_unlock(rwlock_t *lock) @@ -193,8 +193,8 @@ static inline void debug_write_unlock(rwlock_t *lock) RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } void do_raw_write_lock(rwlock_t *lock) diff --git a/kernel/module.c b/kernel/module.c index 2a44c515f0d7..feb1e0fbc3e8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1020,6 +1020,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); free_module(mod); + /* someone could wait for the module in add_unformed_module() */ + wake_up_all(&module_wq); return 0; out: mutex_unlock(&module_mutex); @@ -1201,8 +1203,10 @@ static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, const char *buffer, size_t count) { - kobject_synth_uevent(&mk->kobj, buffer, count); - return count; + int rc; + + rc = kobject_synth_uevent(&mk->kobj, buffer, count); + return rc ? rc : count; } struct module_attribute module_uevent = @@ -1693,6 +1697,8 @@ static int add_usage_links(struct module *mod) return ret; } +static void module_remove_modinfo_attrs(struct module *mod, int end); + static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; @@ -1707,24 +1713,34 @@ static int module_add_modinfo_attrs(struct module *mod) return -ENOMEM; temp_attr = mod->modinfo_attrs; - for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { + for (i = 0; (attr = modinfo_attrs[i]); i++) { if (!attr->test || attr->test(mod)) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj, &temp_attr->attr); + if (error) + goto error_out; ++temp_attr; } } + + return 0; + +error_out: + if (i > 0) + module_remove_modinfo_attrs(mod, --i); return error; } -static void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod, int end) { struct module_attribute *attr; int i; for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { + if (end >= 0 && i > end) + break; /* pick a field to test for end of list */ if (!attr->attr.name) break; @@ -1812,7 +1828,7 @@ static int mod_sysfs_setup(struct module *mod, return 0; out_unreg_modinfo_attrs: - module_remove_modinfo_attrs(mod); + module_remove_modinfo_attrs(mod, -1); out_unreg_param: module_param_sysfs_remove(mod); out_unreg_holders: @@ -1848,7 +1864,7 @@ static void mod_sysfs_fini(struct module *mod) { } -static void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod, int end) { } @@ -1864,7 +1880,7 @@ static void init_param_lock(struct module *mod) static void mod_sysfs_teardown(struct module *mod) { del_usage_links(mod); - module_remove_modinfo_attrs(mod); + module_remove_modinfo_attrs(mod, -1); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); kobject_put(mod->holders_dir); @@ -3389,8 +3405,7 @@ static bool finished_loading(const char *name) sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE - || mod->state == MODULE_STATE_GOING; + ret = !mod || mod->state == MODULE_STATE_LIVE; mutex_unlock(&module_mutex); return ret; @@ -3558,8 +3573,7 @@ again: mutex_lock(&module_mutex); old = find_module_all(mod->name, strlen(mod->name), true); if (old != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { + if (old->state != MODULE_STATE_LIVE) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); err = wait_event_interruptible(module_wq, diff --git a/kernel/padata.c b/kernel/padata.c index 868f947166d7..87540ce72aea 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -265,7 +265,12 @@ static void padata_reorder(struct parallel_data *pd) * The next object that needs serialization might have arrived to * the reorder queues in the meantime, we will be called again * from the timer function if no one else cares for it. + * + * Ensure reorder_objects is read after pd->lock is dropped so we see + * an increment from another task in padata_do_serial. Pairs with + * smp_mb__after_atomic in padata_do_serial. */ + smp_mb(); if (atomic_read(&pd->reorder_objects) && !(pinst->flags & PADATA_RESET)) mod_timer(&pd->timer, jiffies + HZ); @@ -334,6 +339,13 @@ void padata_do_serial(struct padata_priv *padata) list_add_tail(&padata->list, &pqueue->reorder.list); spin_unlock(&pqueue->reorder.lock); + /* + * Ensure the atomic_inc of reorder_objects above is ordered correctly + * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb + * in padata_reorder. + */ + smp_mb__after_atomic(); + put_cpu(); padata_reorder(pd); diff --git a/kernel/panic.c b/kernel/panic.c index 32ff6fd30201..207ceac3a432 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -146,6 +146,7 @@ void panic(const char *fmt, ...) * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); + preempt_disable_notrace(); /* * It's possible to come here directly from a panic-assertion and diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4918314893bc..22091c88b3bb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -351,7 +351,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) } read_lock(&tasklist_lock); - force_sig(SIGKILL, pid_ns->child_reaper); + send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5c36e9c56a6..2d6d14ad7b4f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -258,6 +258,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, (kps % 1000) / 10); } +__weak int arch_resume_nosmt(void) +{ + return 0; +} + /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -322,6 +327,10 @@ static int create_image(int platform_mode) Enable_cpus: enable_nonboot_cpus(); + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); + Platform_finish: platform_finish(platform_mode); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0972a8e09d08..ff2aabb70de9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -734,8 +734,15 @@ zone_found: * We have found the zone. Now walk the radix tree to find the leaf node * for our PFN. */ + + /* + * If the zone we wish to scan is the the current zone and the + * pfn falls into the current node then we do not need to walk + * the tree. + */ node = bm->cur.node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + if (zone == bm->cur.zone && + ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2e2c86dd226f..4e50beb162c0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -432,6 +432,7 @@ static u32 clear_idx; /* record buffer */ #define LOG_ALIGN __alignof__(struct printk_log) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -1032,18 +1033,23 @@ void log_buf_vmcoreinfo_setup(void) static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ -static void __init log_buf_len_update(unsigned size) +static void __init log_buf_len_update(u64 size) { + if (size > (u64)LOG_BUF_LEN_MAX) { + size = (u64)LOG_BUF_LEN_MAX; + pr_err("log_buf over 2G is not supported.\n"); + } + if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) - new_log_buf_len = size; + new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { - unsigned int size; + u64 size; if (!str) return -EINVAL; @@ -1093,7 +1099,7 @@ void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; - int free; + unsigned int free; if (log_buf != __log_buf) return; @@ -1113,7 +1119,7 @@ void __init setup_log_buf(int early) } if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %ld bytes not available\n", + pr_err("log_buf_len: %lu bytes not available\n", new_log_buf_len); return; } @@ -1126,8 +1132,8 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); logbuf_unlock_irqrestore(flags); - pr_info("log_buf_len: %d bytes\n", log_buf_len); - pr_info("early log buf free: %d(%d%%)\n", + pr_info("log_buf_len: %u bytes\n", log_buf_len); + pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -3189,7 +3195,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - while (l > size && seq < dumper->next_seq) { + while (l >= size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); l -= msg_print_text(msg, true, NULL, 0); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 84b1367935e4..09fb3f58a838 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -29,6 +29,7 @@ #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> #include <linux/compat.h> +#include <linux/sched/signal.h> /* * Access another process' address space via ptrace. @@ -77,9 +78,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, */ static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - rcu_read_lock(); - __ptrace_link(child, new_parent, __task_cred(new_parent)); - rcu_read_unlock(); + __ptrace_link(child, new_parent, current_cred()); } /** @@ -322,6 +321,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); + /* + * If a task drops privileges and becomes nondumpable (through a syscall + * like setresuid()) while we are trying to access it, we must ensure + * that the dumpability is read after the credentials; otherwise, + * we may be able to attach to a task that we shouldn't be able to + * attach to (as if the task had dropped privileges without becoming + * nondumpable). + * Pairs with a write barrier in commit_creds(). + */ + smp_rmb(); mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && @@ -703,6 +712,10 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (arg.nr < 0) return -EINVAL; + /* Ensure arg.off fits in an unsigned long */ + if (arg.off > ULONG_MAX) + return 0; + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) pending = &child->signal->shared_pending; else @@ -710,18 +723,20 @@ static int ptrace_peek_siginfo(struct task_struct *child, for (i = 0; i < arg.nr; ) { siginfo_t info; - s32 off = arg.off + i; + unsigned long off = arg.off + i; + bool found = false; spin_lock_irq(&child->sighand->siglock); list_for_each_entry(q, &pending->list, list) { if (!off--) { + found = true; copy_siginfo(&info, &q->info); break; } } spin_unlock_irq(&child->sighand->siglock); - if (off >= 0) /* beyond the end of the list */ + if (!found) /* beyond the end of the list */ break; #ifdef CONFIG_COMPAT @@ -925,18 +940,26 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; - case PTRACE_GETSIGMASK: + case PTRACE_GETSIGMASK: { + sigset_t *mask; + if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } - if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + if (test_tsk_restore_sigmask(child)) + mask = &child->saved_sigmask; + else + mask = &child->blocked; + + if (copy_to_user(datavp, mask, sizeof(sigset_t))) ret = -EFAULT; else ret = 0; break; + } case PTRACE_SETSIGMASK: { sigset_t new_set; @@ -962,6 +985,8 @@ int ptrace_request(struct task_struct *child, long request, child->blocked = new_set; spin_unlock_irq(&child->sighand->siglock); + clear_tsk_restore_sigmask(child); + ret = 0; break; } diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 1f87a02c3399..9b0d38812eb6 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -542,6 +542,10 @@ rcu_perf_cleanup(void) if (torture_cleanup_begin()) return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) @@ -663,6 +667,7 @@ rcu_perf_init(void) pr_alert(" %s", perf_ops[i]->name); pr_alert("\n"); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->init) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45f2ffbc1e78..f0c599bf4058 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1599,6 +1599,10 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } + if (!cur_ops) { + torture_cleanup_end(); + return; + } rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -1734,6 +1738,7 @@ rcu_torture_init(void) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 710ce1d6b982..fb051fa99b67 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1789,15 +1789,23 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) } /* - * Awaken the grace-period kthread for the specified flavor of RCU. - * Don't do a self-awaken, and don't bother awakening when there is - * nothing for the grace-period kthread to do (as in several CPUs - * raced to awaken, and we lost), and finally don't try to awaken - * a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in + * an interrupt or softirq handler), and don't bother awakening when there + * is nothing for the grace-period kthread to do (as in several CPUs raced + * to awaken, and we lost), and finally don't try to awaken a kthread that + * has not yet been created. If all those checks are passed, track some + * debug information and awaken. + * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context? Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition. In this case, a wakeup really + * is required, and is therefore supplied. */ static void rcu_gp_kthread_wake(struct rcu_state *rsp) { - if (current == rsp->gp_kthread || + if ((current == rsp->gp_kthread && + !in_interrupt() && !in_serving_softirq()) || !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; diff --git a/kernel/relay.c b/kernel/relay.c index 1537158c67b3..61d37e6da22d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -427,6 +427,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, buf, &chan->is_global); + if (IS_ERR(dentry)) + dentry = NULL; kfree(tmpname); @@ -460,7 +462,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) dentry = chan->cb->create_buf_file(NULL, NULL, S_IRUSR, buf, &chan->is_global); - if (WARN_ON(dentry)) + if (IS_ERR_OR_NULL(dentry)) goto free_buf; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0552ddbb25e2..97a27726ea21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -432,10 +432,11 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * its already queued (either by us or someone else) and will get the * wakeup due to that. * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). + * In order to ensure that a pending wakeup will observe our pending + * state, even in the failed case, an explicit smp_mb() must be used. */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + smp_mb__before_atomic(); + if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) return; get_task_struct(task); @@ -1111,7 +1112,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1132,7 +1134,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -5026,7 +5027,7 @@ long __sched io_schedule_timeout(long timeout) } EXPORT_SYMBOL(io_schedule_timeout); -void io_schedule(void) +void __sched io_schedule(void) { int token; @@ -5241,10 +5242,11 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + __sched_fork(0, idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); - __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; @@ -6342,10 +6344,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (task->sched_class != &fair_sched_class) - return -EINVAL; #endif /* * Serialize against wake_up_new_task() such that if its @@ -6380,6 +6378,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load(shareval)); } @@ -6482,8 +6482,10 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; - else + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; return tg_set_cfs_bandwidth(tg, period, quota); } @@ -6505,6 +6507,9 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; + if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6d689c039189..eae7db21dc04 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -600,15 +600,15 @@ out: return 0; fail: + kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; sugov_tunables_free(tunables); stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch: diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..45c2cd37fe6b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -738,7 +738,7 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); /* We might have scheduled out from guest path */ - if (current->flags & PF_VCPU) + if (tsk->flags & PF_VCPU) vtime_account_guest(tsk, vtime); else __vtime_account_system(tsk, vtime); @@ -781,7 +781,7 @@ void vtime_guest_enter(struct task_struct *tsk) */ write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk, vtime); - current->flags |= PF_VCPU; + tsk->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -792,7 +792,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); - current->flags &= ~PF_VCPU; + tsk->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b2589c7e9439..22770168bff8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -217,7 +217,6 @@ static void task_non_contending(struct task_struct *p) if (dl_se->dl_runtime == 0) return; - WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -234,7 +233,7 @@ static void task_non_contending(struct task_struct *p) * If the "0-lag time" already passed, decrease the active * utilization now, instead of starting a timer */ - if (zerolag_time < 0) { + if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se->dl_bw, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f93e4a2d9f6..187c04a34ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -339,6 +339,7 @@ void register_sched_domain_sysctl(void) { static struct ctl_table *cpu_entries; static struct ctl_table **cpu_idx; + static bool init_done = false; char buf[32]; int i; @@ -368,7 +369,10 @@ void register_sched_domain_sysctl(void) if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; + } + if (!init_done) { + init_done = true; /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f33b24080b1c..0b4e997fea1a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2026,6 +2026,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum / p->se.load.weight; *period = LOAD_AVG_MAX; @@ -2354,13 +2358,23 @@ no_join: return; } -void task_numa_free(struct task_struct *p) +/* + * Get rid of NUMA staticstics associated with a task (either current or dead). + * If @final is set, the task is dead and has reached refcount zero, so we can + * safely free all relevant data structures. Otherwise, there might be + * concurrent reads from places like load balancing and procfs, and we should + * reset the data back to default state without freeing ->numa_faults. + */ +void task_numa_free(struct task_struct *p, bool final) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults; + unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; + if (!numa_faults) + return; + if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) @@ -2373,8 +2387,14 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - kfree(numa_faults); + if (final) { + p->numa_faults = NULL; + kfree(numa_faults); + } else { + p->total_numa_faults = 0; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + numa_faults[i] = 0; + } } /* @@ -4071,23 +4091,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4109,8 +4122,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4127,65 +4139,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. - */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; + if (cfs_rq->throttled) + return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled @@ -4365,8 +4335,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -4382,13 +4351,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, if (!cfs_rq_throttled(cfs_rq)) goto next; + /* By the above check, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + runtime = -cfs_rq->runtime_remaining + 1; if (runtime > remaining) runtime = remaining; remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -4413,7 +4384,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4441,8 +4412,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -4455,8 +4424,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; @@ -4533,8 +4501,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -4566,7 +4533,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); @@ -4583,7 +4549,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -4592,11 +4557,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -4672,12 +4636,15 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } +extern const u64 max_cfs_quota_period; + static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); int overrun; int idle = 0; + int count = 0; raw_spin_lock(&cfs_b->lock); for (;;) { @@ -4685,6 +4652,36 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + if (++count > 3) { + u64 new, old = ktime_to_ns(cfs_b->period); + + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. + */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } + + /* reset count so we don't come right back in here */ + count = 0; + } + idle = do_sched_cfs_period_timer(cfs_b, overrun); } if (idle) @@ -5651,6 +5648,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); +EXPORT_SYMBOL_GPL(sched_smt_present); static inline void set_idle_cores(int cpu, int val) { @@ -7017,10 +7015,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) if (cfs_rq->last_h_load_update == now) return; - cfs_rq->h_load_next = NULL; + WRITE_ONCE(cfs_rq->h_load_next, NULL); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_load_next = se; + WRITE_ONCE(cfs_rq->h_load_next, se); if (cfs_rq->last_h_load_update == now) break; } @@ -7030,7 +7028,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) cfs_rq->last_h_load_update = now; } - while ((se = cfs_rq->h_load_next) != NULL) { + while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) { load = cfs_rq->h_load; load = div64_ul(load * se->avg.load_avg, cfs_rq_load_avg(cfs_rq) + 1); @@ -8308,9 +8306,10 @@ more_balance: out_balanced: /* * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag if it was set. + * constraints. Clear the imbalance flag only if other tasks got + * a chance to move and fix the imbalance. */ - if (sd_parent) { + if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) @@ -8328,13 +8327,22 @@ out_all_pinned: sd->nr_balance_failed = 0; out_one_pinned: + ld_moved = 0; + + /* + * idle_balance() disregards balance intervals, so we could repeatedly + * reach this code, which would lead to balance_interval skyrocketting + * in a short amount of time. Skip the balance_interval increase logic + * to avoid that. + */ + if (env.idle == CPU_NEWLY_IDLE) + goto out; + /* tune up the balancing interval */ if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - - ld_moved = 0; out: return ld_moved; } @@ -9371,18 +9379,18 @@ err: void online_fair_sched_group(struct task_group *tg) { struct sched_entity *se; + struct rq_flags rf; struct rq *rq; int i; for_each_possible_cpu(i) { rq = cpu_rq(i); se = tg->se[i]; - - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cb9a5b8532fa..cc7dd1aaf08e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2533,6 +2533,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; + else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -2553,6 +2555,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; + if (rt_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 452b56923c6d..268f560ec998 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -280,8 +280,6 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; short idle, period_active; struct hrtimer period_timer, slack_timer; @@ -489,8 +487,6 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock, throttled_clock_task; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 659e075ef70b..867d173dab48 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -499,7 +499,7 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); struct s_data { - struct sched_domain ** __percpu sd; + struct sched_domain * __percpu *sd; struct root_domain *rd; }; @@ -1347,7 +1347,7 @@ void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); if (!sched_domains_numa_distance) return; diff --git a/kernel/signal.c b/kernel/signal.c index 164c36ef0825..c9b203875001 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -77,6 +77,10 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); + /* SIGKILL and SIGSTOP may not be sent to the global init */ + if (unlikely(is_global_init(t) && sig_kernel_only(sig))) + return true; + if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; @@ -672,6 +676,48 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) kick_process(t); } +static int dequeue_synchronous_signal(siginfo_t *info) +{ + struct task_struct *tsk = current; + struct sigpending *pending = &tsk->pending; + struct sigqueue *q, *sync = NULL; + + /* + * Might a synchronous signal be in the queue? + */ + if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) + return 0; + + /* + * Return the first synchronous signal in the queue. + */ + list_for_each_entry(q, &pending->list, list) { + /* Synchronous signals have a postive si_code */ + if ((q->info.si_code > SI_USER) && + (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { + sync = q; + goto next; + } + } + return 0; +next: + /* + * Check if there is another siginfo for the same signal. + */ + list_for_each_entry_continue(q, &pending->list, list) { + if (q->info.si_signo == sync->info.si_signo) + goto still_pending; + } + + sigdelset(&pending->signal, sync->info.si_signo); + recalc_sigpending(); +still_pending: + list_del_init(&sync->list); + copy_siginfo(info, &sync->info); + __sigqueue_free(sync); + return info->si_signo; +} + /* * Remove signals in mask from the pending set and queue. * Returns 1 if any signals were found. @@ -2225,6 +2271,16 @@ relock: goto relock; } + /* Has this task already been marked for death? */ + if (signal_group_exit(signal)) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); + recalc_sigpending(); + goto fatal; + } + for (;;) { struct k_sigaction *ka; @@ -2238,7 +2294,15 @@ relock: goto relock; } - signr = dequeue_signal(current, ¤t->blocked, &ksig->info); + /* + * Signals generated by the execution of an instruction + * need to be delivered before any other pending signals + * so that the instruction pointer in the signal stack + * frame points to the faulting instruction. + */ + signr = dequeue_synchronous_signal(&ksig->info); + if (!signr) + signr = dequeue_signal(current, ¤t->blocked, &ksig->info); if (!signr) break; /* will return 0 */ @@ -2320,6 +2384,7 @@ relock: continue; } + fatal: spin_unlock_irq(&sighand->siglock); /* diff --git a/kernel/smp.c b/kernel/smp.c index 2d1da290f144..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -584,8 +584,6 @@ void __init smp_init(void) num_nodes, (num_nodes > 1 ? "s" : ""), num_cpus, (num_cpus > 1 ? "s" : "")); - /* Final decision about SMT support */ - cpu_smt_check_topology(); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } diff --git a/kernel/sys.c b/kernel/sys.c index e25ec93aea22..ab96b9882347 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1861,7 +1861,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d330b1ce3b94..74fc3a9d1923 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,7 +124,9 @@ static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; +static unsigned long zero_ul; static unsigned long one_ul = 1; +static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int one_thousand = 1000; #ifdef CONFIG_PRINTK @@ -1395,7 +1397,7 @@ static struct ctl_table vm_table[] = { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = &one, .extra2 = &four, @@ -1681,6 +1683,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, }, { .procname = "nr_open", @@ -2530,7 +2534,16 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, { struct do_proc_dointvec_minmax_conv_param *param = data; if (write) { - int val = *negp ? -*lvalp : *lvalp; + int val; + if (*negp) { + if (*lvalp > (unsigned long) INT_MAX + 1) + return -EINVAL; + val = -*lvalp; + } else { + if (*lvalp > (unsigned long) INT_MAX) + return -EINVAL; + val = *lvalp; + } if ((param->min && *param->min > val) || (param->max && *param->max < val)) return -EINVAL; @@ -2708,6 +2721,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int bool neg; left -= proc_skip_spaces(&p); + if (!left) + break; err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, @@ -2717,8 +2732,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4559e914452b..390c76d4503c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -568,25 +568,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; - struct taskstats *stats; + struct taskstats *stats_new, *stats; - if (sig->stats || thread_group_empty(tsk)) - goto ret; + /* Pairs with smp_store_release() below. */ + stats = smp_load_acquire(&sig->stats); + if (stats || thread_group_empty(tsk)) + return stats; /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); spin_lock_irq(&tsk->sighand->siglock); - if (!sig->stats) { - sig->stats = stats; - stats = NULL; + stats = sig->stats; + if (!stats) { + /* + * Pairs with smp_store_release() above and order the + * kmem_cache_zalloc(). + */ + smp_store_release(&sig->stats, stats_new); + stats = stats_new; + stats_new = NULL; } spin_unlock_irq(&tsk->sighand->siglock); - if (stats) - kmem_cache_free(taskstats_cache, stats); -ret: - return sig->stats; + if (stats_new) + kmem_cache_free(taskstats_cache, stats_new); + + return stats; } /* Send pid data out on exit */ diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fa5de5e8de61..f4255a65c44b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -597,7 +597,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return ktime_sub(now, alarm->node.expires); + return ktime_sub(alarm->node.expires, now); } /** @@ -676,7 +676,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) enum alarmtimer_type type; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!capable(CAP_WAKE_ALARM)) return -EPERM; @@ -794,7 +794,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, int ret = 0; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (flags & ~TIMER_ABSTIME) return -EINVAL; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d00e85ac10d6..ecce9122343b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -845,7 +845,8 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -867,7 +868,8 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -894,8 +896,9 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { int reprogram; /* diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 99e03bec68e4..9288532f73c8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -43,6 +43,7 @@ static u64 tick_length_base; #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +#define MAX_TAI_OFFSET 100000 /* * phase-lock loop variables @@ -640,7 +641,8 @@ static inline void process_adjtimex_modes(struct timex *txc, time_constant = max(time_constant, 0l); } - if (txc->modes & ADJ_TAI && txc->constant > 0) + if (txc->modes & ADJ_TAI && + txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 17cdc554c9fe..e5706a826c1f 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -27,8 +27,6 @@ #include "posix-timers.h" -static void delete_clock(struct kref *kref); - /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. */ @@ -138,7 +136,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = 0; if (!err) { - kref_get(&clk->kref); + get_device(clk->dev); fp->private_data = clk; } out: @@ -154,7 +152,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp) if (clk->ops.release) err = clk->ops.release(clk); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); fp->private_data = NULL; @@ -174,38 +172,35 @@ static const struct file_operations posix_clock_file_operations = { #endif }; -int posix_clock_register(struct posix_clock *clk, dev_t devid) +int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; - kref_init(&clk->kref); init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } clk->cdev.owner = clk->ops.owner; - err = cdev_add(&clk->cdev, devid, 1); + clk->dev = dev; - return err; + return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); -static void delete_clock(struct kref *kref) -{ - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - - if (clk->release) - clk->release(clk); -} - void posix_clock_unregister(struct posix_clock *clk) { - cdev_del(&clk->cdev); + cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 58045eb976c3..c750c80570e8 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -44,34 +44,39 @@ static int bc_shutdown(struct clock_event_device *evt) */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { - int bc_moved; /* - * We try to cancel the timer first. If the callback is on - * flight on some other cpu then we let it handle it. If we - * were able to cancel the timer nothing can rearm it as we - * own broadcast_lock. + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. * - * However we can also be called from the event handler of - * ce_broadcast_hrtimer itself when it expires. We cannot - * restart the timer because we are in the callback, but we - * can set the expiry time and let the callback return - * HRTIMER_RESTART. + * hrtimer_cancel() cannot be called here neither from the + * broadcast handler nor from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. * - * Since we are in the idle loop at this point and because - * hrtimer_{start/cancel} functions call into tracing, - * calls to these functions must be bound within RCU_NONIDLE. + * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callbback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + * + * As this can be called from idle code, the hrtimer_start() + * invocation has to be wrapped with RCU_NONIDLE() as + * hrtimer_start() can call into tracing. */ - RCU_NONIDLE({ - bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) - hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED);}); - if (bc_moved) { - /* Bind the "device" to the cpu */ - bc->bound_on = smp_processor_id(); - } else if (bc->bound_on == smp_processor_id()) { - hrtimer_set_expires(&bctimer, expires); - } + RCU_NONIDLE( { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. + */ + bc->bound_on = bctimer.base->cpu_base->cpu; + } ); return 0; } @@ -97,10 +102,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) - if (ce_broadcast_hrtimer.next_event != KTIME_MAX) - return HRTIMER_RESTART; - return HRTIMER_NORESTART; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..1ce7c404d0b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -41,7 +41,9 @@ static struct { seqcount_t seq; struct timekeeper timekeeper; -} tk_core ____cacheline_aligned; +} tk_core ____cacheline_aligned = { + .seq = SEQCNT_ZERO(tk_core.seq), +}; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f17c76a1a05f..9f8e8892e5b0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1545,21 +1545,23 @@ void timer_clear_idle(void) static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { + unsigned long now = READ_ONCE(jiffies); + /* * NOHZ optimization. After a long idle sleep we need to forward the * base to current jiffies. Avoid a loop by searching the bitfield for * the next expiring timer. */ - if ((long)(jiffies - base->clk) > 2) { + if ((long)(now - base->clk) > 2) { unsigned long next = __next_timer_interrupt(base); /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, jiffies)) { + if (time_after(next, now)) { /* The call site will increment clock! */ - base->clk = jiffies - 1; + base->clk = now - 1; return 0; } base->clk = next; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 0ed768b56c60..7e9f149d34ea 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -289,23 +289,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now) SEQ_printf(m, "\n"); } -static int timer_list_show(struct seq_file *m, void *v) -{ - struct timer_list_iter *iter = v; - - if (iter->cpu == -1 && !iter->second_pass) - timer_list_header(m, iter->now); - else if (!iter->second_pass) - print_cpu(m, iter->cpu, iter->now); -#ifdef CONFIG_GENERIC_CLOCKEVENTS - else if (iter->cpu == -1 && iter->second_pass) - timer_list_show_tickdevices_header(m); - else - print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); -#endif - return 0; -} - void sysrq_timer_list_show(void) { u64 now = ktime_to_ns(ktime_get()); @@ -324,6 +307,24 @@ void sysrq_timer_list_show(void) return; } +#ifdef CONFIG_PROC_FS +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + static void *move_iter(struct timer_list_iter *iter, loff_t offset) { for (; offset; offset--) { @@ -395,3 +396,4 @@ static int __init init_timer_list_procfs(void) return 0; } __initcall(init_timer_list_procfs); +#endif diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9937d7cf2a64..3864d2341442 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,6 +33,7 @@ #include <linux/list.h> #include <linux/hash.h> #include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <trace/events/sched.h> @@ -621,8 +622,7 @@ static int function_stat_show(struct seq_file *m, void *v) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER - avg = rec->time; - do_div(avg, rec->counter); + avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) goto out; #endif @@ -648,7 +648,8 @@ static int function_stat_show(struct seq_file *m, void *v) * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, + rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); @@ -1711,6 +1712,11 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1839,15 +1845,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, } /* - * If the rec had TRAMP enabled, then it needs to - * be cleared. As TRAMP can only be enabled iff - * there is only a single ops attached to it. - * In otherwords, always disable it on decrementing. - * In the future, we may set it if rec count is - * decremented to one, and the ops that is left - * has a trampoline. + * The TRAMP needs to be set only if rec count + * is decremented to one, and the ops that is + * left has a trampoline. As TRAMP can only be + * enabled if there is only a single ops attached + * to it. */ - rec->flags &= ~FTRACE_FL_TRAMP; + if (ftrace_rec_count(rec) == 1 && + ftrace_find_tramp_ops_any(rec)) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; /* * flags will be cleared in ftrace_check_record() @@ -2040,11 +2048,6 @@ static void print_ip_ins(const char *fmt, const unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static struct ftrace_ops * -ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); -static struct ftrace_ops * -ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); - enum ftrace_bug_type ftrace_bug_type; const void *ftrace_expected; @@ -3181,6 +3184,14 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = &iter->probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + /* + * A probe being registered may temporarily have an empty hash + * and it's at the end of the func_probes list. + */ + if (!hash || hash == EMPTY_HASH) + return NULL; + size = 1 << hash->size_bits; retry: @@ -3618,21 +3629,22 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, struct ftrace_hash *hash; struct list_head *mod_head; struct trace_array *tr = ops->private; - int ret = 0; + int ret = -ENOMEM; ftrace_ops_init(ops); if (unlikely(ftrace_disabled)) return -ENODEV; + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - return -ENOMEM; + goto out; - if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { - kfree(iter); - return -ENOMEM; - } + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) + goto out; iter->ops = ops; iter->flags = flag; @@ -3662,13 +3674,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, if (!iter->hash) { trace_parser_put(&iter->parser); - kfree(iter); - ret = -ENOMEM; goto out_unlock; } } else iter->hash = hash; + ret = 0; + if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3680,7 +3692,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, /* Failed */ free_ftrace_hash(iter->hash); trace_parser_put(&iter->parser); - kfree(iter); } } else file->private_data = iter; @@ -3688,6 +3699,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, out_unlock: mutex_unlock(&ops->func_hash->regex_lock); + out: + if (ret) { + kfree(iter); + if (tr) + trace_array_put(tr); + } + return ret; } @@ -4279,10 +4297,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, struct ftrace_func_entry *entry; struct ftrace_func_map *map; struct hlist_head *hhd; - int size = 1 << mapper->hash.size_bits; - int i; + int size, i; + + if (!mapper) + return; if (free_func && mapper->hash.count) { + size = 1 << mapper->hash.size_bits; for (i = 0; i < size; i++) { hhd = &mapper->hash.buckets[i]; hlist_for_each_entry(entry, hhd, hlist) { @@ -4374,12 +4395,21 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_unlock(&ftrace_lock); + /* + * Note, there's a small window here that the func_hash->filter_hash + * may be NULL or empty. Need to be carefule when reading the loop. + */ mutex_lock(&probe->ops.func_hash->regex_lock); orig_hash = &probe->ops.func_hash->filter_hash; old_hash = *orig_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) { + ret = -ENOMEM; + goto out; + } + ret = ftrace_match_records(hash, glob, strlen(glob)); /* Nothing found? */ @@ -5075,6 +5105,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); + if (iter->tr) + trace_array_put(iter->tr); kfree(iter); return 0; @@ -6035,7 +6067,7 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static inline void +static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { @@ -6098,11 +6130,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } +NOKPROBE_SYMBOL(ftrace_ops_list_func); #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } +NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif /* @@ -6132,6 +6166,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); trace_clear_recursion(bit); } +NOKPROBE_SYMBOL(ftrace_ops_assist_func); /** * ftrace_ops_get_func - get the function a trampoline should call diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1d5e0949dcf..8123a8b53c54 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -700,7 +700,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) preempt_disable_notrace(); time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return time; } @@ -4010,6 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over + * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording @@ -4027,7 +4028,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4035,7 +4036,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), flags); if (!iter) return NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e9cbb96cd99e..c456c2b06277 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -494,8 +494,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * not modified. */ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) + if (!pid_list) { + trace_parser_put(&parser); return -ENOMEM; + } pid_list->pid_max = READ_ONCE(pid_max); @@ -505,6 +507,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); if (!pid_list->pids) { + trace_parser_put(&parser); kfree(pid_list); return -ENOMEM; } @@ -3381,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file const char tgid_space[] = " "; const char space[] = " "; + print_event_info(buf, m); + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? tgid_space : space); seq_printf(m, "# %s / _----=> need-resched\n", @@ -3899,7 +3904,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -3909,7 +3915,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); @@ -4145,9 +4152,14 @@ static int show_traces_open(struct inode *inode, struct file *file) if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + ret = seq_open(file, &show_traces_seq_ops); - if (ret) + if (ret) { + trace_array_put(tr); return ret; + } m = file->private_data; m->private = tr; @@ -4155,6 +4167,14 @@ static int show_traces_open(struct inode *inode, struct file *file) return 0; } +static int show_traces_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return seq_release(inode, file); +} + static ssize_t tracing_write_stub(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) @@ -4185,8 +4205,8 @@ static const struct file_operations tracing_fops = { static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, - .release = seq_release, .llseek = seq_lseek, + .release = show_traces_release, }; static ssize_t @@ -4348,6 +4368,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { + if ((mask == TRACE_ITER_RECORD_TGID) || + (mask == TRACE_ITER_RECORD_CMD)) + lockdep_assert_held(&event_mutex); + /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) return 0; @@ -4413,6 +4437,7 @@ static int trace_set_options(struct trace_array *tr, char *option) cmp += 2; } + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); for (i = 0; trace_options[i]; i++) { @@ -4427,6 +4452,7 @@ static int trace_set_options(struct trace_array *tr, char *option) ret = set_tracer_option(tr, cmp, neg); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); /* * If the first trailing whitespace is replaced with '\0' by strstrip, @@ -5602,7 +5628,6 @@ out: return ret; fail: - kfree(iter->trace); kfree(iter); __trace_array_put(tr); mutex_unlock(&trace_types_lock); @@ -5745,6 +5770,7 @@ waitagain: sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); cpumask_clear(iter->started); + trace_seq_init(&iter->seq); iter->pos = -1; trace_event_read_lock(); @@ -6388,11 +6414,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; } #endif - if (!tr->allocated_snapshot) { + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, iter->cpu_file); + else ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) - break; - } + if (ret < 0) + break; local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) @@ -6713,28 +6741,36 @@ struct buffer_ref { struct ring_buffer *buffer; void *page; int cpu; - int ref; + refcount_t refcount; }; +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - ref->ref++; + if (refcount_read(&ref->refcount) > INT_MAX/2) + return false; + + refcount_inc(&ref->refcount); + return true; } /* Pipe buffer operations for a buffer. */ @@ -6742,7 +6778,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -6755,11 +6791,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); spd->partial[i].private = 0; } @@ -6814,7 +6846,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - ref->ref = 1; + refcount_set(&ref->refcount, 1); ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { @@ -7347,9 +7379,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); if (ret < 0) return ret; @@ -8239,12 +8273,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) cnt++; - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 851cd1605085..dbb212c40a41 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -504,12 +504,44 @@ enum { * can only be modified by current, we can reuse trace_recursion. */ TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT @@ -839,8 +871,9 @@ extern void __trace_graph_return(struct trace_array *tr, extern struct ftrace_hash *ftrace_graph_hash; extern struct ftrace_hash *ftrace_graph_notrace_hash; -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { + unsigned long addr = trace->func; int ret = 0; preempt_disable_notrace(); @@ -851,6 +884,14 @@ static inline int ftrace_graph_addr(unsigned long addr) } if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + + /* + * This needs to be cleared on the return functions + * when the depth is zero. + */ + trace_recursion_set(TRACE_GRAPH_BIT); + trace_recursion_set_depth(trace->depth); + /* * If no irqs are to be traced, but a set_graph_function * is set, and called by an interrupt handler, we still @@ -868,6 +909,13 @@ out: return ret; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ + if (trace_recursion_test(TRACE_GRAPH_BIT) && + trace->depth == trace_recursion_depth()) + trace_recursion_clear(TRACE_GRAPH_BIT); +} + static inline int ftrace_graph_notrace_addr(unsigned long addr) { int ret = 0; @@ -881,7 +929,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return ret; } #else -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { return 1; } @@ -890,6 +938,8 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) { return 0; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ } #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; @@ -897,7 +947,8 @@ extern unsigned int fgraph_max_depth; static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) { /* trace it when it is-nested-in or is a function enabled. */ - return !(trace->depth || ftrace_graph_addr(trace->func)) || + return !(trace_recursion_test(TRACE_GRAPH_BIT) || + ftrace_graph_addr(trace)) || (trace->depth < 0) || (fgraph_max_depth && trace->depth >= fgraph_max_depth); } @@ -1820,4 +1871,22 @@ static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) extern struct trace_iterator *tracepoint_print_iter; +/* + * Reset the state of the trace_iterator so that it can read consumed data. + * Normally, the trace_iterator is used for reading the data when it is not + * consumed, and must retain state. + */ +static __always_inline void trace_iterator_reset(struct trace_iterator *iter) +{ + const size_t offset = offsetof(struct trace_iterator, seq); + + /* + * Keep gcc from complaining about overwriting more than just one + * member in the structure. + */ + memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); + + iter->pos = -1; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d53268a4e167..2b0a01b2be2d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -326,7 +326,8 @@ void trace_event_enable_cmd_record(bool enable) struct trace_event_file *file; struct trace_array *tr; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) @@ -340,7 +341,6 @@ void trace_event_enable_cmd_record(bool enable) clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); - mutex_unlock(&event_mutex); } void trace_event_enable_tgid_record(bool enable) @@ -348,7 +348,8 @@ void trace_event_enable_tgid_record(bool enable) struct trace_event_file *file; struct trace_array *tr; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; @@ -362,7 +363,6 @@ void trace_event_enable_tgid_record(bool enable) &file->flags); } } while_for_each_event_file(); - mutex_unlock(&event_mutex); } static int __ftrace_event_enable_disable(struct trace_event_file *file, @@ -1319,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) char buf[32]; int len; - if (*ppos) - return 0; - if (unlikely(!id)) return -ENODEV; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7eb975a2d0e1..e8c9eba9b1e7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -872,9 +872,10 @@ static inline void add_to_key(char *compound_key, void *key, /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - } - memcpy(compound_key + key_field->offset, key, size); + strncpy(compound_key + key_field->offset, (char *)key, size); + } else + memcpy(compound_key + key_field->offset, key, size); } static void event_hist_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 169b3c44ee97..72d0d477f5c1 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -482,6 +482,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) int cpu; int pc; + ftrace_graph_addr_finish(trace); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -505,6 +507,8 @@ void set_graph_array(struct trace_array *tr) static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { + ftrace_graph_addr_finish(trace); + if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) return; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d7c8e4ec3d9d..bff6c033d70e 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -152,7 +152,7 @@ void trace_hwlat_callback(bool enter) if (enter) nmi_ts_start = time_get(); else - nmi_total_ts = time_get() - nmi_ts_start; + nmi_total_ts += time_get() - nmi_ts_start; } if (enter) @@ -258,6 +258,8 @@ static int get_sample(void) /* Keep a running maximum ever recorded hardware latency */ if (sample > tr->max_latency) tr->max_latency = sample; + if (outer_sample > tr->max_latency) + tr->max_latency = outer_sample; } out: diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7758bc0617cb..2d9e12380dc3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -204,6 +204,8 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_dec(tr, &data, &flags)) return; diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d953c163a079..2905a3dd94c1 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -41,24 +41,22 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("Dumping ftrace buffer:\n"); - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7d461dcd4831..0fa9dadf3f4f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -270,6 +270,8 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_preempt_disable(tr, &data, &pc)) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ea0d90a31fc9..fdf2ea4d64ec 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,7 +17,7 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> */ -#define pr_fmt(fmt) "trace_kprobe: " fmt +#define pr_fmt(fmt) "trace_uprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> @@ -153,7 +153,14 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, ret = strncpy_from_user(dst, src, maxlen); if (ret == maxlen) - dst[--ret] = '\0'; + dst[ret - 1] = '\0'; + else if (ret >= 0) + /* + * Include the terminating null byte. In this case it + * was copied by strncpy_from_user but not accounted + * for in ret. + */ + ret++; if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0'; diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 305039b122fa..35b2ba07f3c6 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -90,8 +90,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b) #define DEFINE_TRACING_MAP_CMP_FN(type) \ static int tracing_map_cmp_##type(void *val_a, void *val_b) \ { \ - type a = *(type *)val_a; \ - type b = *(type *)val_b; \ + type a = (type)(*(u64 *)val_a); \ + type b = (type)(*(u64 *)val_b); \ \ return (a > b) ? 1 : ((a < b) ? -1 : 0); \ } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 08bc551976b2..a37f5dc7cb39 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2366,8 +2366,14 @@ repeat: */ if (need_to_create_worker(pool)) { spin_lock(&wq_mayday_lock); - get_pwq(pwq); - list_move_tail(&pwq->mayday_node, &wq->maydays); + /* + * Queue iff we aren't racing destruction + * and somebody else hasn't queued it already. + */ + if (wq->rescuer && list_empty(&pwq->mayday_node)) { + get_pwq(pwq); + list_add_tail(&pwq->mayday_node, &wq->maydays); + } spin_unlock(&wq_mayday_lock); } } @@ -4084,9 +4090,29 @@ void destroy_workqueue(struct workqueue_struct *wq) struct pool_workqueue *pwq; int node; + /* + * Remove it from sysfs first so that sanity check failure doesn't + * lead to sysfs name conflicts. + */ + workqueue_sysfs_unregister(wq); + /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ + if (wq->rescuer) { + struct worker *rescuer = wq->rescuer; + + /* this prevents new queueing */ + spin_lock_irq(&wq_mayday_lock); + wq->rescuer = NULL; + spin_unlock_irq(&wq_mayday_lock); + + /* rescuer will empty maydays list before exiting */ + kthread_stop(rescuer->task); + kfree(rescuer); + } + /* sanity checks */ mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { @@ -4118,11 +4144,6 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - workqueue_sysfs_unregister(wq); - - if (wq->rescuer) - kthread_stop(wq->rescuer->task); - if (!(wq->flags & WQ_UNBOUND)) { /* * The base ref is never dropped on per-cpu pwqs. Directly @@ -4399,7 +4420,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + pr_cont(" active=%d/%d refcnt=%d%s\n", + pwq->nr_active, pwq->max_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { |