From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Jan 2016 12:18:41 -0500 Subject: cpuset: make mm migration asynchronous If "cpuset.memory_migrate" is set, when a process is moved from one cpuset to another with a different memory node mask, pages in used by the process are migrated to the new set of nodes. This was performed synchronously in the ->attach() callback, which is synchronized against process management. Recently, the synchronization was changed from per-process rwsem to global percpu rwsem for simplicity and optimization. Combined with the synchronous mm migration, this led to deadlocks because mm migration could schedule a work item which may in turn try to create a new worker blocking on the process management lock held from cgroup process migration path. This heavy an operation shouldn't be performed synchronously from that deep inside cgroup migration in the first place. This patch punts the actual migration to an ordered workqueue and updates cgroup process migration and cpuset config update paths to flush the workqueue after all locks are released. This way, the operations still seem synchronous to userland without entangling mm migration with process management synchronization. CPU hotplug can also invoke mm migration but there's no reason for it to wait for mm migrations and thus doesn't synchronize against their completions. Signed-off-by: Tejun Heo Reported-and-tested-by: Christian Borntraeger Cc: stable@vger.kernel.org # v4.4+ --- include/linux/cpuset.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..fea160ee5803 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } +extern void cpuset_post_attach_flush(void); + #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } +static inline void cpuset_post_attach_flush(void) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ -- cgit v1.2.3 From aa226ff4a1ce79f229c6b7a4c0a14e17fececd01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Jan 2016 15:31:11 -0500 Subject: cgroup: make sure a parent css isn't offlined before its children There are three subsystem callbacks in css shutdown path - css_offline(), css_released() and css_free(). Except for css_released(), cgroup core didn't guarantee the order of invocation. css_offline() or css_free() could be called on a parent css before its children. This behavior is unexpected and led to bugs in cpu and memory controller. This patch updates offline path so that a parent css is never offlined before its children. Each css keeps online_cnt which reaches zero iff itself and all its children are offline and offline_css() is invoked only after online_cnt reaches zero. This fixes the memory controller bug and allows the fix for cpu controller. Signed-off-by: Tejun Heo Reported-and-tested-by: Christian Borntraeger Reported-by: Brian Christiansen Link: http://lkml.kernel.org/g/5698A023.9070703@de.ibm.com Link: http://lkml.kernel.org/g/CAKB58ikDkzc8REt31WBkD99+hxNzjK4+FBmhkgS+NVrC9vjMSg@mail.gmail.com Cc: Heiko Carstens Cc: Peter Zijlstra Cc: stable@vger.kernel.org --- include/linux/cgroup-defs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7f540f7f588d..789471dba6fb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -127,6 +127,12 @@ struct cgroup_subsys_state { */ u64 serial_nr; + /* + * Incremented by online self and children. Used to guarantee that + * parents are not offlined before their children. + */ + atomic_t online_cnt; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; -- cgit v1.2.3 From fb3296335500aaff61333df8eabbccf28761c79d Mon Sep 17 00:00:00 2001 From: Danesh Petigara Date: Mon, 11 Jan 2016 13:22:26 -0800 Subject: drivers: ata: wake port before DMA stop for ALPM The AHCI driver code stops and starts port DMA engines at will without considering the power state of the particular port. The AHCI specification isn't very clear on how to handle this scenario, leaving implementation open to interpretation. Broadcom's STB SATA host controller is unable to handle port DMA controller restarts when the port in question is in low power mode. When a port enters partial or slumber mode, its PHY is powered down. When a controller restart is requested, the controller's internal state machine expects the PHY to be brought back up by software which never happens in this case, resulting in failures. To avoid this situation, logic is added to manually wake up the port just before its DMA engine is stopped, if the port happens to be in a low power state. HBA initiated power management ensures that the port eventually returns to its configured low power state, when the link is idle (as per the conditions listed in the spec). A new host flag is also added to ensure this logic is only exercised for hosts with the above limitation. tj: Formatting changes. Signed-off-by: Danesh Petigara Reviewed-by: Markus Mayer Signed-off-by: Tejun Heo --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 851821bfd553..bec2abbd7ab2 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -526,6 +526,7 @@ enum ata_lpm_policy { enum ata_lpm_hints { ATA_LPM_EMPTY = (1 << 0), /* port empty/probing */ ATA_LPM_HIPM = (1 << 1), /* may use HIPM */ + ATA_LPM_WAKE_ONLY = (1 << 2), /* only wake up link */ }; /* forward declarations */ -- cgit v1.2.3 From 23d11a58a9a60dcb52c8fc6494efce908b24c295 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Jan 2016 05:59:46 -0500 Subject: workqueue: skip flush dependency checks for legacy workqueues fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") implemented flush dependency warning which triggers if a PF_MEMALLOC task or WQ_MEM_RECLAIM workqueue tries to flush a !WQ_MEM_RECLAIM workquee. This assumes that workqueues marked with WQ_MEM_RECLAIM sit in memory reclaim path and making it depend on something which may need more memory to make forward progress can lead to deadlocks. Unfortunately, workqueues created with the legacy create*_workqueue() interface always have WQ_MEM_RECLAIM regardless of whether they are depended upon memory reclaim or not. These spurious WQ_MEM_RECLAIM markings cause spurious triggering of the flush dependency checks. WARNING: CPU: 0 PID: 6 at kernel/workqueue.c:2361 check_flush_dependency+0x138/0x144() workqueue: WQ_MEM_RECLAIM deferwq:deferred_probe_work_func is flushing !WQ_MEM_RECLAIM events:lru_add_drain_per_cpu ... Workqueue: deferwq deferred_probe_work_func [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x94/0xd4) [] (dump_stack) from [] (warn_slowpath_common+0x80/0xb0) [] (warn_slowpath_common) from [] (warn_slowpath_fmt+0x30/0x40) [] (warn_slowpath_fmt) from [] (check_flush_dependency+0x138/0x144) [] (check_flush_dependency) from [] (flush_work+0x50/0x15c) [] (flush_work) from [] (lru_add_drain_all+0x130/0x180) [] (lru_add_drain_all) from [] (migrate_prep+0x8/0x10) [] (migrate_prep) from [] (alloc_contig_range+0xd8/0x338) [] (alloc_contig_range) from [] (cma_alloc+0xe0/0x1ac) [] (cma_alloc) from [] (__alloc_from_contiguous+0x38/0xd8) [] (__alloc_from_contiguous) from [] (__dma_alloc+0x240/0x278) [] (__dma_alloc) from [] (arm_dma_alloc+0x54/0x5c) [] (arm_dma_alloc) from [] (dmam_alloc_coherent+0xc0/0xec) [] (dmam_alloc_coherent) from [] (ahci_port_start+0x150/0x1dc) [] (ahci_port_start) from [] (ata_host_start.part.3+0xc8/0x1c8) [] (ata_host_start.part.3) from [] (ata_host_activate+0x50/0x148) [] (ata_host_activate) from [] (ahci_host_activate+0x44/0x114) [] (ahci_host_activate) from [] (ahci_platform_init_host+0x1d8/0x3c8) [] (ahci_platform_init_host) from [] (tegra_ahci_probe+0x448/0x4e8) [] (tegra_ahci_probe) from [] (platform_drv_probe+0x50/0xac) [] (platform_drv_probe) from [] (driver_probe_device+0x214/0x2c0) [] (driver_probe_device) from [] (bus_for_each_drv+0x60/0x94) [] (bus_for_each_drv) from [] (__device_attach+0xb0/0x114) [] (__device_attach) from [] (bus_probe_device+0x84/0x8c) [] (bus_probe_device) from [] (deferred_probe_work_func+0x68/0x98) [] (deferred_probe_work_func) from [] (process_one_work+0x120/0x3f8) [] (process_one_work) from [] (worker_thread+0x38/0x55c) [] (worker_thread) from [] (kthread+0xdc/0xf4) [] (kthread) from [] (ret_from_fork+0x14/0x3c) Fix it by marking workqueues created via create*_workqueue() with __WQ_LEGACY and disabling flush dependency checks on them. Signed-off-by: Tejun Heo Reported-and-tested-by: Thierry Reding Link: http://lkml.kernel.org/g/20160126173843.GA11115@ulmo.nvidia.com Fixes: fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") --- include/linux/workqueue.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0e32bc71245e..ca73c503b92a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -311,6 +311,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ + __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -411,12 +412,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ - alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \ - 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ + WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ - alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name) + alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) extern void destroy_workqueue(struct workqueue_struct *wq); -- cgit v1.2.3 From 8244062ef1e54502ef55f54cced659913f244c3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Feb 2016 16:55:26 +1030 Subject: modules: fix longstanding /proc/kallsyms vs module insertion race. For CONFIG_KALLSYMS, we keep two symbol tables and two string tables. There's one full copy, marked SHF_ALLOC and laid out at the end of the module's init section. There's also a cut-down version that only contains core symbols and strings, and lives in the module's core section. After module init (and before we free the module memory), we switch the mod->symtab, mod->num_symtab and mod->strtab to point to the core versions. We do this under the module_mutex. However, kallsyms doesn't take the module_mutex: it uses preempt_disable() and rcu tricks to walk through the modules, because it's used in the oops path. It's also used in /proc/kallsyms. There's nothing atomic about the change of these variables, so we can get the old (larger!) num_symtab and the new symtab pointer; in fact this is what I saw when trying to reproduce. By grouping these variables together, we can use a carefully-dereferenced pointer to ensure we always get one or the other (the free of the module init section is already done in an RCU callback, so that's safe). We allocate the init one at the end of the module init section, and keep the core one inside the struct module itself (it could also have been allocated at the end of the module core, but that's probably overkill). Reported-by: Weilong Chen Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541 Cc: stable@kernel.org Signed-off-by: Rusty Russell --- include/linux/module.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 4560d8f1545d..2bb0c3085706 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -324,6 +324,12 @@ struct module_layout { #define __module_layout_align #endif +struct mod_kallsyms { + Elf_Sym *symtab; + unsigned int num_symtab; + char *strtab; +}; + struct module { enum module_state state; @@ -405,15 +411,10 @@ struct module { #endif #ifdef CONFIG_KALLSYMS - /* - * We keep the symbol and string tables for kallsyms. - * The core_* fields below are temporary, loader-only (they - * could really be discarded after module init). - */ - Elf_Sym *symtab, *core_symtab; - unsigned int num_symtab, core_num_syms; - char *strtab, *core_strtab; - + /* Protected by RCU and/or module_mutex: use rcu_dereference() */ + struct mod_kallsyms *kallsyms; + struct mod_kallsyms core_kallsyms; + /* Section attributes */ struct module_sect_attrs *sect_attrs; -- cgit v1.2.3 From 0fb5b1fb30fba3671dd5b1489d78e93e08d62e4e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 4 Feb 2016 00:52:12 -0500 Subject: block/sd: Return -EREMOTEIO when WRITE SAME and DISCARD are disabled When a storage device rejects a WRITE SAME command we will disable write same functionality for the device and return -EREMOTEIO to the block layer. -EREMOTEIO will in turn prevent DM from retrying the I/O and/or failing the path. Yiwen Jiang discovered a small race where WRITE SAME requests issued simultaneously would cause -EIO to be returned. This happened because any requests being prepared after WRITE SAME had been disabled for the device caused us to return BLKPREP_KILL. The latter caused the block layer to return -EIO upon completion. To overcome this we introduce BLKPREP_INVALID which indicates that this is an invalid request for the device. blk_peek_request() is modified to return -EREMOTEIO in that case. Reported-by: Yiwen Jiang Suggested-by: Mike Snitzer Reviewed-by: Hannes Reinicke Reviewed-by: Ewan Milne Reviewed-by: Yiwen Jiang Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d372ea87ead5..a9b643a0647f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -681,9 +681,12 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) /* * q->prep_rq_fn return values */ -#define BLKPREP_OK 0 /* serve it */ -#define BLKPREP_KILL 1 /* fatal error, kill */ -#define BLKPREP_DEFER 2 /* leave on queue */ +enum { + BLKPREP_OK, /* serve it */ + BLKPREP_KILL, /* fatal error, kill, return -EIO */ + BLKPREP_DEFER, /* leave on queue */ + BLKPREP_INVALID, /* invalid command, kill, return -EREMOTEIO */ +}; extern unsigned long blk_max_low_pfn, blk_max_pfn; -- cgit v1.2.3 From 5f74f82ea34c0da80ea0b49192bb5ea06e063593 Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 3 Feb 2016 09:26:57 +0100 Subject: net:Add sysctl_max_skb_frags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devices may have limits on the number of fragments in an skb they support. Current codebase uses a constant as maximum for number of fragments one skb can hold and use. When enabling scatter/gather and running traffic with many small messages the codebase uses the maximum number of fragments and may thereby violate the max for certain devices. The patch introduces a global variable as max number of fragments. Signed-off-by: Hans Westgaard Ry Reviewed-by: HÃ¥kon Bugge Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 11f935c1a090..4ce9ff7086f4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -299,6 +299,7 @@ struct sk_buff; #else #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1) #endif +extern int sysctl_max_skb_frags; typedef struct skb_frag_struct skb_frag_t; -- cgit v1.2.3 From 4a389810bc3cb0e73443104f0827e81e23cb1e12 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Feb 2016 16:13:14 -0800 Subject: kernel/locking/lockdep.c: convert hash tables to hlists Mike said: : CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i. e : kernel with CONFIG_UBSAN_ALIGNMENT fails to load without even any error : message. : : The problem is that ubsan callbacks use spinlocks and might be called : before lockdep is initialized. Particularly this line in the : reserve_ebda_region function causes problem: : : lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); : : If i put lockdep_init() before reserve_ebda_region call in : x86_64_start_reservations kernel loads well. Fix this ordering issue permanently: change lockdep so that it uses hlists for the hash tables. Unlike a list_head, an hlist_head is in its initialized state when it is all-zeroes, so lockdep is ready for operation immediately upon boot - lockdep_init() need not have run. The patch will also save some memory. lockdep_init() and lockdep_initialized can be done away with now - a 4.6 patch has been prepared to do this. Reported-by: Mike Krinkin Suggested-by: Mike Krinkin Cc: Andrey Ryabinin Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c57e424d914b..4dca42fd32f5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,7 +66,7 @@ struct lock_class { /* * class-hash: */ - struct list_head hash_entry; + struct hlist_node hash_entry; /* * global list of all lock-classes: @@ -199,7 +199,7 @@ struct lock_chain { u8 irq_context; u8 depth; u16 base; - struct list_head entry; + struct hlist_node entry; u64 chain_key; }; -- cgit v1.2.3 From db78c22230d0bcc8b27b81f05b39f104f08232c5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 11 Feb 2016 16:13:17 -0800 Subject: mm: fix pfn_t vs highmem The pfn_t type uses an unsigned long to store a pfn + flags value. On a 64-bit platform the upper 12 bits of an unsigned long are never used for storing the value of a pfn. However, this is not true on highmem platforms, all 32-bits of a pfn value are used to address a 44-bit physical address space. A pfn_t needs to store a 64-bit value. Link: https://bugzilla.kernel.org/show_bug.cgi?id=112211 Fixes: 01c8f1c44b83 ("mm, dax, gpu: convert vm_insert_mixed to pfn_t") Signed-off-by: Dan Williams Reported-by: Stuart Foster Reported-by: Julian Margetson Tested-by: Julian Margetson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn.h | 2 +- include/linux/pfn_t.h | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 2d8e49711b63..1132953235c0 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -10,7 +10,7 @@ * backing is indicated by flags in the high bits of the value. */ typedef struct { - unsigned long val; + u64 val; } pfn_t; #endif diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 37448ab5fb5c..94994810c7c0 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -9,14 +9,13 @@ * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver */ -#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \ - << (BITS_PER_LONG - PAGE_SHIFT)) -#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1)) -#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2)) -#define PFN_DEV (1UL << (BITS_PER_LONG - 3)) -#define PFN_MAP (1UL << (BITS_PER_LONG - 4)) - -static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags) +#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) +#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) +#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) +#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) +#define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4)) + +static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags) { pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; @@ -29,7 +28,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -87,7 +86,7 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) #ifdef __HAVE_ARCH_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { - const unsigned long flags = PFN_DEV|PFN_MAP; + const u64 flags = PFN_DEV|PFN_MAP; return (pfn.val & flags) == flags; } -- cgit v1.2.3