From f5b98461cb8167ba362ad9f74c41d126b7becea7 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 6 Jan 2017 19:32:01 +0100 Subject: random: use chacha20 for get_random_int/long Now that our crng uses chacha20, we can rely on its speedy characteristics for replacing MD5, while simultaneously achieving a higher security guarantee. Before the idea was to use these functions if you wanted random integers that aren't stupidly insecure but aren't necessarily secure either, a vague gray zone, that hopefully was "good enough" for its users. With chacha20, we can strengthen this claim, since either we're using an rdrand-like instruction, or we're using the same crng as /dev/urandom. And it's faster than what was before. We could have chosen to replace this with a SipHash-derived function, which might be slightly faster, but at the cost of having yet another RNG construction in the kernel. By moving to chacha20, we have a single RNG to analyze and verify, and we also already get good performance improvements on all platforms. Implementation-wise, rather than use a generic buffer for both get_random_int/long and memcpy based on the size needs, we use a specific buffer for 32-bit reads and for 64-bit reads. This way, we're guaranteed to always have aligned accesses on all platforms. While slightly more verbose in C, the assembly this generates is a lot simpler than otherwise. Finally, on 32-bit platforms where longs and ints are the same size, we simply alias get_random_int to get_random_long. Signed-off-by: Jason A. Donenfeld Suggested-by: Theodore Ts'o Cc: Theodore Ts'o Cc: Hannes Frederic Sowa Cc: Andy Lutomirski Signed-off-by: Theodore Ts'o --- include/linux/random.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 7bd2403e4fef..16ab429735a7 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -37,7 +37,6 @@ extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); extern void del_random_ready_callback(struct random_ready_callback *rdy); extern void get_random_bytes_arch(void *buf, int nbytes); -extern int random_int_secret_init(void); #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; -- cgit v1.2.3 From c440408cf6901eeb2c09563397e24a9097907078 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 22 Jan 2017 16:34:08 +0100 Subject: random: convert get_random_int/long into get_random_u32/u64 Many times, when a user wants a random number, he wants a random number of a guaranteed size. So, thinking of get_random_int and get_random_long in terms of get_random_u32 and get_random_u64 makes it much easier to achieve this. It also makes the code simpler. On 32-bit platforms, get_random_int and get_random_long are both aliased to get_random_u32. On 64-bit platforms, int->u32 and long->u64. Signed-off-by: Jason A. Donenfeld Cc: Greg Kroah-Hartman Cc: Theodore Ts'o Signed-off-by: Theodore Ts'o --- include/linux/random.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 16ab429735a7..ed5c3838780d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -42,8 +42,21 @@ extern void get_random_bytes_arch(void *buf, int nbytes); extern const struct file_operations random_fops, urandom_fops; #endif -unsigned int get_random_int(void); -unsigned long get_random_long(void); +u32 get_random_u32(void); +u64 get_random_u64(void); +static inline unsigned int get_random_int(void) +{ + return get_random_u32(); +} +static inline unsigned long get_random_long(void) +{ +#if BITS_PER_LONG == 64 + return get_random_u64(); +#else + return get_random_u32(); +#endif +} + unsigned long randomize_page(unsigned long start, unsigned long range); u32 prandom_u32(void); -- cgit v1.2.3 From cd8d860dcce906cd477be7d0648ba6f56a52eaa6 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 28 Feb 2017 11:32:22 -0500 Subject: jump_label: Fix anonymous union initialization Pre-4.6 gcc do not allow direct static initialization of members of anonymous structs/unions. After commit 3821fd35b58d ("jump_label: Reduce the size of struct static_key") STATIC_KEY_INIT_{TRUE|FALSE} definitions cannot be compiled with those older compilers. Placing initializers inside curved brackets works around this problem. Link: http://lkml.kernel.org/r/1488299542-30765-1-git-send-email-boris.ostrovsky@oracle.com Fixes: 3821fd35b58d ("jump_label: Reduce the size of struct static_key") Reviewed-by: Jason Baron Compiled-by: Chris Mason Signed-off-by: Boris Ostrovsky Signed-off-by: Steven Rostedt (VMware) --- include/linux/jump_label.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 680c98b2f41c..a7f90117cf7d 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -166,10 +166,10 @@ extern void static_key_disable(struct static_key *key); */ #define STATIC_KEY_INIT_TRUE \ { .enabled = { 1 }, \ - .entries = (void *)JUMP_TYPE_TRUE } + { .entries = (void *)JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ { .enabled = { 0 }, \ - .entries = (void *)JUMP_TYPE_FALSE } + { .entries = (void *)JUMP_TYPE_FALSE } } #else /* !HAVE_JUMP_LABEL */ -- cgit v1.2.3 From b17ef2ed624aa7c1f68ed11acd16ecbf80fe01d7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Mar 2017 17:28:45 -0500 Subject: jump_label: Add comment about initialization order for anonymous unions Commit 3821fd35b58d ("jump_label: Reduce the size of struct static_key") broke old compilers that could not handle static initialization of anonymous unions. Boris fixed it with a patch that added brackets around the static initializer. But this creates a dependency between those initializers and the structure's order of its fields. Document this dependency in case new fields are added to struct static_key in the future. Noted-by: Boris Ostrovsky Suggested-by: Chris Mason Signed-off-by: Steven Rostedt (VMware) --- include/linux/jump_label.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index a7f90117cf7d..28e04a33535a 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -90,6 +90,13 @@ extern bool static_key_initialized; struct static_key { atomic_t enabled; /* + * Note: + * To make anonymous unions work with old compilers, the static + * initialization of them requires brackets. This creates a dependency + * on the order of the struct with the initializers. If any fields + * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need + * to be modified. + * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod -- cgit v1.2.3 From 4dfc050571523ac2bc02cbf948dd47621f7dd83f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 21 Feb 2017 11:32:47 +0000 Subject: KVM: arm/arm64: vgic-v3: Don't pretend to support IRQ/FIQ bypass Our GICv3 emulation always presents ICC_SRE_EL1 with DIB/DFB set to zero, which implies that there is a way to bypass the GIC and inject raw IRQ/FIQ by driving the CPU pins. Of course, we don't allow that when the GIC is configured, but we fail to indicate that to the guest. The obvious fix is to set these bits (and never let them being changed again). Reported-by: Peter Maydell Acked-by: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 672cfef72fc8..97cbca19430d 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -373,6 +373,8 @@ #define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) #define ICC_IGRPEN1_EL1_SHIFT 0 #define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) +#define ICC_SRE_EL1_DIB (1U << 2) +#define ICC_SRE_EL1_DFB (1U << 1) #define ICC_SRE_EL1_SRE (1U << 0) /* -- cgit v1.2.3 From b3e228473e6cec7cf83b4025b4570c8066ab2dd8 Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Thu, 2 Mar 2017 16:11:47 +0100 Subject: irqdomain: Add empty irq_domain_check_msi_remap Fix following build error for s390: drivers/vfio/vfio_iommu_type1.c: In function 'vfio_iommu_type1_attach_group': drivers/vfio/vfio_iommu_type1.c:1290:25: error: implicit declaration of function 'irq_domain_check_msi_remap' Acked-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 188eced6813e..9f3616085423 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -524,6 +524,10 @@ static inline struct irq_domain *irq_find_matching_fwnode( { return NULL; } +static inline bool irq_domain_check_msi_remap(void) +{ + return false; +} #endif /* !CONFIG_IRQ_DOMAIN */ #endif /* _LINUX_IRQDOMAIN_H */ -- cgit v1.2.3 From 040757f738e13caaa9c5078bca79aa97e11dde88 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 5 Mar 2017 15:03:22 -0600 Subject: ucount: Remove the atomicity from ucount->count Always increment/decrement ucount->count under the ucounts_lock. The increments are there already and moving the decrements there means the locking logic of the code is simpler. This simplification in the locking logic fixes a race between put_ucounts and get_ucounts that could result in a use-after-free because the count could go zero then be found by get_ucounts and then be freed by put_ucounts. A bug presumably this one was found by a combination of syzkaller and KASAN. JongWhan Kim reported the syzkaller failure and Dmitry Vyukov spotted the race in the code. Cc: stable@vger.kernel.org Fixes: f6b2db1a3e8d ("userns: Make the count of user namespaces per user") Reported-by: JongHwan Kim Reported-by: Dmitry Vyukov Reviewed-by: Andrei Vagin Signed-off-by: "Eric W. Biederman" --- include/linux/user_namespace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index be765234c0a2..32354b4b4b2b 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -72,7 +72,7 @@ struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; - atomic_t count; + int count; atomic_t ucount[UCOUNT_COUNTS]; }; -- cgit v1.2.3 From 7cc5e38f2f0b0b58a22a4c18a56348dd99a71270 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 12 Feb 2017 17:11:07 +0100 Subject: libceph: osd_request_timeout option osd_request_timeout specifies how many seconds to wait for a response from OSDs before returning -ETIMEDOUT from an OSD request. 0 (default) means no limit. osd_request_timeout is osdkeepalive-precise -- in-flight requests are swept through every osdkeepalive seconds. With ack vs commit behaviour gone, abort_request() is really simple. This is based on a patch from Artur Molchanov . Tested-by: Artur Molchanov Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/libceph.h | 2 ++ include/linux/ceph/osd_client.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 1816c5e26581..88cd5dc8e238 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -48,6 +48,7 @@ struct ceph_options { unsigned long mount_timeout; /* jiffies */ unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */ + unsigned long osd_request_timeout; /* jiffies */ /* * any type that can't be simply compared or doesn't need need @@ -68,6 +69,7 @@ struct ceph_options { #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */ #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2ea0c282f3dc..c125b5d9e13c 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -189,6 +189,7 @@ struct ceph_osd_request { /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ + unsigned long r_start_stamp; /* jiffies */ int r_attempts; struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; -- cgit v1.2.3 From c01228db4ba965986511a5b28c478bddd7e2726e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Mar 2017 17:48:34 +0100 Subject: Revert "scsi, block: fix duplicate bdi name registration crashes" This reverts commit 0dba1314d4f81115dce711292ec7981d17231064. It causes leaking of device numbers for SCSI when SCSI registers multiple gendisks for one request_queue in succession. It can be easily reproduced using Omar's script [1] on kernel with CONFIG_DEBUG_TEST_DRIVER_REMOVE. Furthermore the protection provided by this commit is not needed anymore as the problem it was fixing got also fixed by commit 165a5e22fafb "block: Move bdi_unregister() to del_gendisk()". [1]: http://marc.info/?l=linux-block&m=148554717109098&w=2 Signed-off-by: Jan Kara Acked-by: Dan Williams Tested-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - include/linux/genhd.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 796016e63c1d..5a7da607ca04 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -435,7 +435,6 @@ struct request_queue { struct delayed_work delay_work; struct backing_dev_info *backing_dev_info; - struct disk_devt *disk_devt; /* * The queue owner gets to use this for whatever they like. diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a999d281a2f1..76f39754e7b0 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -167,13 +167,6 @@ struct blk_integrity { }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ -struct disk_devt { - atomic_t count; - void (*release)(struct disk_devt *disk_devt); -}; - -void put_disk_devt(struct disk_devt *disk_devt); -void get_disk_devt(struct disk_devt *disk_devt); struct gendisk { /* major, first_minor and minors are input parameters only, @@ -183,7 +176,6 @@ struct gendisk { int first_minor; int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ - struct disk_devt *disk_devt; char disk_name[DISK_NAME_LEN]; /* name of major driver */ char *(*devnode)(struct gendisk *gd, umode_t *mode); -- cgit v1.2.3 From bd0f9b356d00aa241ced36fb075a07041c28d3b8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Mar 2017 15:33:14 -0800 Subject: sched/headers: fix up header file dependency on The scheduler header file split and cleanups ended up exposing a few nasty header file dependencies, and in particular it showed how we in ended up depending on "signal_pending()", which now comes from . That's a very subtle and annoying dependency, which already caused a semantic merge conflict (see commit e58bc927835a "Pull overlayfs updates from Miklos Szeredi", which added that fixup in the merge commit). It turns out that we can avoid this dependency _and_ improve code generation by moving the guts of the fairly nasty helper #define __wait_event_interruptible_locked() to out-of-line code. The code that includes the signal_pending() check is all in the slow-path where we actually go to sleep waiting for the event anyway, so using a helper function is the right thing to do. Using a helper function is also what we already did for the non-locked versions, see the "__wait_event*()" macros and the "prepare_to_wait*()" set of helper functions. We might want to try to unify all these macro games, we have a _lot_ of subtly different wait-event loops. But this is the minimal patch to fix the annoying header dependency. Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/wait.h | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index aacb1282d19a..db076ca7f11d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -620,30 +620,19 @@ do { \ __ret; \ }) +extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *); +extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); -#define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \ +#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ - int __ret = 0; \ + int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ - if (likely(list_empty(&__wait.task_list))) \ - __add_wait_queue_tail(&(wq), &__wait); \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (signal_pending(current)) { \ - __ret = -ERESTARTSYS; \ + __ret = fn(&(wq), &__wait); \ + if (__ret) \ break; \ - } \ - if (irq) \ - spin_unlock_irq(&(wq).lock); \ - else \ - spin_unlock(&(wq).lock); \ - schedule(); \ - if (irq) \ - spin_lock_irq(&(wq).lock); \ - else \ - spin_lock(&(wq).lock); \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ @@ -676,7 +665,7 @@ do { \ */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 0)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true @@ -703,7 +692,7 @@ do { \ */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 1)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true @@ -734,7 +723,7 @@ do { \ */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 0)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true @@ -765,7 +754,7 @@ do { \ */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ - ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) + ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ -- cgit v1.2.3 From 505a60e225606fbd3d2eadc31ff793d939ba66f1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 9 Mar 2017 17:24:03 +0300 Subject: asm-generic: introduce 5level-fixup.h We are going to switch core MM to 5-level paging abstraction. This is preparation step which adds As with 4level-fixup.h, the new header allows quickly make all architectures compatible with 5-level paging in core MM. In long run we would like to switch architectures to properly folded p4d level by using , but it requires more changes to arch-specific code. Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d65dd72c0f4..be1fe264eb37 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1619,11 +1619,14 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); * Remove it when 4level-fixup.h has been removed. */ #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) + +#ifndef __ARCH_HAS_5LEVEL_HACK static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? NULL: pud_offset(pgd, address); } +#endif /* !__ARCH_HAS_5LEVEL_HACK */ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { -- cgit v1.2.3 From c2febafc67734a62196c1b9dfba926412d4077ba Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 9 Mar 2017 17:24:07 +0300 Subject: mm: convert generic code to 5-level paging Convert all non-architecture-specific code to 5-level paging. It's mostly mechanical adding handling one more page table level in places where we deal with pud_t. Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 ++++- include/linux/kasan.h | 1 + include/linux/mm.h | 31 +++++++++++++++++++++++++------ 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 503099d8aada..b857fc8cc2ec 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -122,7 +122,7 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags); int pmd_huge(pmd_t pmd); -int pud_huge(pud_t pmd); +int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); @@ -197,6 +197,9 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb, #ifndef pgd_huge #define pgd_huge(x) 0 #endif +#ifndef p4d_huge +#define p4d_huge(x) 0 +#endif #ifndef pgd_write static inline int pgd_write(pgd_t pgd) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index ceb3fe78a0d3..1c823bef4c15 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -18,6 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE]; extern pte_t kasan_zero_pte[PTRS_PER_PTE]; extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; extern pud_t kasan_zero_pud[PTRS_PER_PUD]; +extern p4d_t kasan_zero_p4d[PTRS_PER_P4D]; void kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end); diff --git a/include/linux/mm.h b/include/linux/mm.h index be1fe264eb37..5f01c88f0800 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1560,14 +1560,24 @@ static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, return ptep; } +#ifdef __PAGETABLE_P4D_FOLDED +static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, + unsigned long address) +{ + return 0; +} +#else +int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); +#endif + #ifdef __PAGETABLE_PUD_FOLDED -static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, +static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } #else -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); +int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) @@ -1621,10 +1631,18 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) #ifndef __ARCH_HAS_5LEVEL_HACK -static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, + unsigned long address) +{ + return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? + NULL : p4d_offset(pgd, address); +} + +static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, + unsigned long address) { - return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? - NULL: pud_offset(pgd, address); + return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? + NULL : pud_offset(p4d, address); } #endif /* !__ARCH_HAS_5LEVEL_HACK */ @@ -2388,7 +2406,8 @@ void sparse_mem_maps_populate_node(struct page **map_map, struct page *sparse_mem_map_populate(unsigned long pnum, int nid); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); -pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); +p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); +pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); void *vmemmap_alloc_block(unsigned long size, int node); -- cgit v1.2.3 From 8a1115ff6b6d90cf1066ec3a0c4e51276553eebe Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 9 Mar 2017 16:16:31 -0800 Subject: scripts/spelling.txt: add "disble(d)" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: disble||disable disbled||disabled I kept the TSL2563_INT_DISBLED in /drivers/iio/light/tsl2563.c untouched. The macro is not referenced at all, but this commit is touching only comment blocks just in case. Link: http://lkml.kernel.org/r/1481573103-11329-20-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/regulator/machine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index ad3e5158e586..c9f795e9a2ee 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -65,7 +65,7 @@ struct regulator_state { int uV; /* suspend voltage */ unsigned int mode; /* suspend regulator operating mode */ int enabled; /* is regulator enabled in this suspend state */ - int disabled; /* is the regulator disbled in this suspend state */ + int disabled; /* is the regulator disabled in this suspend state */ }; /** -- cgit v1.2.3 From dd0db88d8094a6d9d4d1fc5fcd56ab619f54ccf8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:16:49 -0800 Subject: userfaultfd: non-cooperative: rollback userfaultfd_exit Patch series "userfaultfd non-cooperative further update for 4.11 merge window". Unfortunately I noticed one relevant bug in userfaultfd_exit while doing more testing. I've been doing testing before and this was also tested by kbuild bot and exercised by the selftest, but this bug never reproduced before. I dropped userfaultfd_exit as result. I dropped it because of implementation difficulty in receiving signals in __mmput and because I think -ENOSPC as result from the background UFFDIO_COPY should be enough already. Before I decided to remove userfaultfd_exit, I noticed userfaultfd_exit wasn't exercised by the selftest and when I tried to exercise it, after moving it to a more correct place in __mmput where it would make more sense and where the vma list is stable, it resulted in the event_wait_completion in D state. So then I added the second patch to be sure even if we call userfaultfd_event_wait_completion too late during task exit(), we won't risk to generate tasks in D state. The same check exists in handle_userfault() for the same reason, except it makes a difference there, while here is just a robustness check and it's run under WARN_ON_ONCE. While looking at the userfaultfd_event_wait_completion() function I looked back at its callers too while at it and I think it's not ok to stop executing dup_fctx on the fcs list because we relay on userfaultfd_event_wait_completion to execute userfaultfd_ctx_put(fctx->orig) which is paired against userfaultfd_ctx_get(fctx->orig) in dup_userfault just before list_add(fcs). This change only takes care of fctx->orig but this area also needs further review looking for similar problems in fctx->new. The only patch that is urgent is the first because it's an use after free during a SMP race condition that affects all processes if CONFIG_USERFAULTFD=y. Very hard to reproduce though and probably impossible without SLUB poisoning enabled. This patch (of 3): I once reproduced this oops with the userfaultfd selftest, it's not easily reproducible and it requires SLUB poisoning to reproduce. general protection fault: 0000 [#1] SMP Modules linked in: CPU: 2 PID: 18421 Comm: userfaultfd Tainted: G ------------ T 3.10.0+ #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.1-0-g8891697-prebuilt.qemu-project.org 04/01/2014 task: ffff8801f83b9440 ti: ffff8801f833c000 task.ti: ffff8801f833c000 RIP: 0010:[] [] userfaultfd_exit+0x29/0xa0 RSP: 0018:ffff8801f833fe80 EFLAGS: 00010202 RAX: ffff8801f833ffd8 RBX: 6b6b6b6b6b6b6b6b RCX: ffff8801f83b9440 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8800baf18600 RBP: ffff8801f833fee8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: ffffffff8127ceb3 R12: 0000000000000000 R13: ffff8800baf186b0 R14: ffff8801f83b99f8 R15: 00007faed746c700 FS: 0000000000000000(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007faf0966f028 CR3: 0000000001bc6000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Call Trace: do_exit+0x297/0xd10 SyS_exit+0x17/0x20 tracesys+0xdd/0xe2 Code: 00 00 66 66 66 66 90 55 48 89 e5 41 54 53 48 83 ec 58 48 8b 1f 48 85 db 75 11 eb 73 66 0f 1f 44 00 00 48 8b 5b 10 48 85 db 74 64 <4c> 8b a3 b8 00 00 00 4d 85 e4 74 eb 41 f6 84 24 2c 01 00 00 80 RIP [] userfaultfd_exit+0x29/0xa0 RSP ---[ end trace 9fecd6dcb442846a ]--- In the debugger I located the "mm" pointer in the stack and walking mm->mmap->vm_next through the end shows the vma->vm_next list is fully consistent and it is null terminated list as expected. So this has to be an SMP race condition where userfaultfd_exit was running while the vma list was being modified by another CPU. When userfaultfd_exit() run one of the ->vm_next pointers pointed to SLAB_POISON (RBX is the vma pointer and is 0x6b6b..). The reason is that it's not running in __mmput but while there are still other threads running and it's not holding the mmap_sem (it can't as it has to wait the even to be received by the manager). So this is an use after free that was happening for all processes. One more implementation problem aside from the race condition: userfaultfd_exit has really to check a flag in mm->flags before walking the vma or it's going to slowdown the exit() path for regular tasks. One more implementation problem: at that point signals can't be delivered so it would also create a task in D state if the manager doesn't read the event. The major design issue: it overall looks superfluous as the manager can check for -ENOSPC in the background transfer: if (mmget_not_zero(ctx->mm)) { [..] } else { return -ENOSPC; } It's safer to roll it back and re-introduce it later if at all. [rppt@linux.vnet.ibm.com: documentation fixup after removal of UFFD_EVENT_EXIT] Link: http://lkml.kernel.org/r/1488345437-4364-1-git-send-email-rppt@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20170224181957.19736-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: Mike Rapoport Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0468548acebf..f2b79bf4c895 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -72,8 +72,6 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); -extern void userfaultfd_exit(struct mm_struct *mm); - #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -139,10 +137,6 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -static inline void userfaultfd_exit(struct mm_struct *mm) -{ -} - #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ -- cgit v1.2.3 From cbfd0c1001bedb4b051cf4a1f5df24f1500381bc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 9 Mar 2017 16:16:57 -0800 Subject: include/linux/fs.h: fix unsigned enum warning with gcc-4.2 With arm-linux-gcc-4.2, almost every file we build in the kernel ends up with this warning: include/linux/fs.h:2648: warning: comparison of unsigned expression < 0 is always false Later versions don't have this problem, but it's easy enough to work around. Link: http://lkml.kernel.org/r/20161216105634.235457-12-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Russell King Cc: Brendan Gregg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index aad3fd0ff5f8..7251f7bb45e8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2678,7 +2678,7 @@ static const char * const kernel_read_file_str[] = { static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id) { - if (id < 0 || id >= READING_MAX_ID) + if ((unsigned)id >= READING_MAX_ID) return kernel_read_file_str[READING_UNKNOWN]; return kernel_read_file_str[id]; -- cgit v1.2.3 From ce9311cf95ad8baf044a014738d76973d93b739a Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Thu, 9 Mar 2017 16:17:00 -0800 Subject: mm/vmstats: add thp_split_pud event for clarity We added support for PUD-sized transparent hugepages, however we count the event "thp split pud" into thp_split_pmd event. To separate the event count of thp split pud from pmd, add a new event named thp_split_pud. Link: http://lkml.kernel.org/r/1488282380-5076-1-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Joonsoo Kim Cc: Sebastian Siewior Cc: Hugh Dickins Cc: Christoph Lameter Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: David Rientjes Cc: Hanjun Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 6aa1b6cb5828..a80b7b59cf33 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -79,6 +79,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + THP_SPLIT_PUD, +#endif THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif -- cgit v1.2.3 From 70ccb92fdd90b35bb6f9200093d4ffd6cb38156b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:17:11 -0800 Subject: userfaultfd: non-cooperative: userfaultfd_remove revalidate vma in MADV_DONTNEED userfaultfd_remove() has to be execute before zapping the pagetables or UFFDIO_COPY could keep filling pages after zap_page_range returned, which would result in non zero data after a MADV_DONTNEED. However userfaultfd_remove() may have to release the mmap_sem. This was handled correctly in MADV_REMOVE, but MADV_DONTNEED accessed a potentially stale vma (the very vma passed to zap_page_range(vma, ...)). The fix consists in revalidating the vma in case userfaultfd_remove() had to release the mmap_sem. This also optimizes away an unnecessary down_read/up_read in the MADV_REMOVE case if UFFD_EVENT_FORK had to be delivered. It all remains zero runtime cost in case CONFIG_USERFAULTFD=n as userfaultfd_remove() will be defined as "true" at build time. Link: http://lkml.kernel.org/r/20170302173738.18994-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2b79bf4c895..48a3483dccb1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,8 +61,7 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); -extern void userfaultfd_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, +extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -118,11 +117,11 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, { } -static inline void userfaultfd_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, +static inline bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { + return true; } static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, -- cgit v1.2.3 From 40c50c1fecdf012a3bf055ec813f0ef2eda2749c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Mar 2017 13:17:18 +0100 Subject: kexec, x86/purgatory: Unbreak it and clean it up The purgatory code defines global variables which are referenced via a symbol lookup in the kexec code (core and arch). A recent commit addressing sparse warnings made these static and thereby broke kexec_file. Why did this happen? Simply because the whole machinery is undocumented and lacks any form of forward declarations. The variable names are unspecific and lack a prefix, so adding forward declarations creates shadow variables in the core code. Aside of that the code relies on magic constants and duplicate struct definitions with no way to ensure that these things stay in sync. The section placement of the purgatory variables happened by chance and not by design. Unbreak kexec and cleanup the mess: - Add proper forward declarations and document the usage - Use common struct definition - Use the proper common defines instead of magic constants - Add a purgatory_ prefix to have a proper name space - Use ARRAY_SIZE() instead of a homebrewn reimplementation - Add proper sections to the purgatory variables [ From Mike ] Fixes: 72042a8c7b01 ("x86/purgatory: Make functions and variables static") Reported-by: Mike Galbraith < Signed-off-by: Thomas Gleixner Cc: Nicholas Mc Guire Cc: Borislav Petkov Cc: Vivek Goyal Cc: "Tobin C. Harding" Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1703101315140.3681@nanos Signed-off-by: Thomas Gleixner --- include/linux/purgatory.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/linux/purgatory.h (limited to 'include/linux') diff --git a/include/linux/purgatory.h b/include/linux/purgatory.h new file mode 100644 index 000000000000..d60d4e278609 --- /dev/null +++ b/include/linux/purgatory.h @@ -0,0 +1,23 @@ +#ifndef _LINUX_PURGATORY_H +#define _LINUX_PURGATORY_H + +#include +#include +#include + +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * These forward declarations serve two purposes: + * + * 1) Make sparse happy when checking arch/purgatory + * 2) Document that these are required to be global so the symbol + * lookup in kexec works + */ +extern struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX]; +extern u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE]; + +#endif -- cgit v1.2.3