From 339e1e54891c339b30023c9cc8a005cbf65a3c0c Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 8 Oct 2016 16:59:38 +0800 Subject: clk: core: add __init decoration for CLK_OF_DECLARE_DRIVER function The new introduced macro CLK_OF_DECLARE_DRIVER is usually used to declare clock driver init functions, which are mostly decorated with __init. Add __init decoration for CLK_OF_DECLARE_DRIVER function to avoid causing section mismatch warnings on client clock drivers. Signed-off-by: Shawn Guo Fixes: c7296c51ce5d ("clk: core: New macro CLK_OF_DECLARE_DRIVER") Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index af596381fa0f..a428aec36ace 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -785,7 +785,7 @@ extern struct of_device_id __clk_of_table; * routines, one at of_clk_init(), and one at platform device probe */ #define CLK_OF_DECLARE_DRIVER(name, compat, fn) \ - static void name##_of_clk_init_driver(struct device_node *np) \ + static void __init name##_of_clk_init_driver(struct device_node *np) \ { \ of_node_clear_flag(np, OF_POPULATED); \ fn(np); \ -- cgit v1.2.3 From d33fd776f992332be110f6ceca6dad60cb5d513f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:51:28 +1100 Subject: iomap: add IOMAP_REPORT This allows the file system to tell a FIEMAP from a read operation, and thus avoids the need to report flags that aren't actually used in the read path. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- include/linux/iomap.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e63e288dee83..7892f55a1866 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -19,11 +19,15 @@ struct vm_fault; #define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ /* - * Flags for iomap mappings: + * Flags for all iomap mappings: */ -#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */ -#define IOMAP_F_SHARED 0x02 /* block shared with another file */ -#define IOMAP_F_NEW 0x04 /* blocks have been newly allocated */ +#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ + +/* + * Flags that only need to be reported for IOMAP_REPORT requests: + */ +#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ +#define IOMAP_F_SHARED 0x20 /* block shared with another file */ /* * Magic value for blkno: @@ -42,8 +46,9 @@ struct iomap { /* * Flags for iomap_begin / iomap_end. No flag implies a read. */ -#define IOMAP_WRITE (1 << 0) -#define IOMAP_ZERO (1 << 1) +#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ +#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ +#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ struct iomap_ops { /* -- cgit v1.2.3 From f1caa61df2a3dc4c58316295c5dc5edba4c68d85 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Mon, 24 Oct 2016 00:31:31 -0400 Subject: ACPI/PCI: pci_link: penalize SCI correctly Ondrej reported that IRQs stopped working in v4.7 on several platforms. A typical scenario, from Ondrej's VT82C694X/694X, is: ACPI: Using PIC for interrupt routing ACPI: PCI Interrupt Link [LNKA] (IRQs 1 3 4 5 6 7 10 *11 12 14 15) ACPI: No IRQ available for PCI Interrupt Link [LNKA] 8139too 0000:00:0f.0: PCI INT A: no GSI We're using PIC routing, so acpi_irq_balance == 0, and LNKA is already active at IRQ 11. In that case, acpi_pci_link_allocate() only tries to use the active IRQ (IRQ 11) which also happens to be the SCI. We should penalize the SCI by PIRQ_PENALTY_PCI_USING, but irq_get_trigger_type(11) returns something other than IRQ_TYPE_LEVEL_LOW, so we penalize it by PIRQ_PENALTY_ISA_ALWAYS instead, which makes acpi_pci_link_allocate() assume the IRQ isn't available and give up. Add acpi_penalize_sci_irq() so platforms can tell us the SCI IRQ, trigger, and polarity directly and we don't have to depend on irq_get_trigger_type(). Fixes: 103544d86976 (ACPI,PCI,IRQ: reduce resource requirements) Link: http://lkml.kernel.org/r/201609251512.05657.linux@rainbow-software.org Reported-by: Ondrej Zary Acked-by: Bjorn Helgaas Signed-off-by: Sinan Kaya Tested-by: Jonathan Liu Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index ddbeda6dbdc8..689a8b9b9c8f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -326,6 +326,7 @@ struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); +void acpi_penalize_sci_irq(int irq, int trigger, int polarity); void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); -- cgit v1.2.3 From 0d7317598214134d73da59990b846481a9527a00 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 24 Oct 2016 10:57:25 +0100 Subject: mm: unexport __get_user_pages() This patch unexports the low-level __get_user_pages() function. Recent refactoring of the get_user_pages* functions allow flags to be passed through get_user_pages() which eliminates the need for access to this function from its one user, kvm. We can see that the two calls to get_user_pages() which replace __get_user_pages() in kvm_main.c are equivalent by examining their call stacks: get_user_page_nowait(): get_user_pages(start, 1, flags, page, NULL) __get_user_pages_locked(current, current->mm, start, 1, page, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, start, 1, flags | FOLL_TOUCH | FOLL_GET, page, NULL, NULL) check_user_page_hwpoison(): get_user_pages(addr, 1, flags, NULL, NULL) __get_user_pages_locked(current, current->mm, addr, 1, NULL, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, addr, 1, flags | FOLL_TOUCH, NULL, NULL, NULL) Signed-off-by: Lorenzo Stoakes Acked-by: Paolo Bonzini Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a191853faaa..a92c8d73aeaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1271,10 +1271,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, -- cgit v1.2.3 From 8ef4227615e158faa4ee85a1d6466782f7e22f2f Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 24 Oct 2016 15:27:59 +1000 Subject: x86/io: add interface to reserve io memtype for a resource range. (v1.1) A recent change to the mm code in: 87744ab3832b mm: fix cache mode tracking in vm_insert_mixed() started enforcing checking the memory type against the registered list for amixed pfn insertion mappings. It happens that the drm drivers for a number of gpus relied on this being broken. Currently the driver only inserted VRAM mappings into the tracking table when they came from the kernel, and userspace mappings never landed in the table. This led to a regression where all the mapping end up as UC instead of WC now. I've considered a number of solutions but since this needs to be fixed in fixes and not next, and some of the solutions were going to introduce overhead that hadn't been there before I didn't consider them viable at this stage. These mainly concerned hooking into the TTM io reserve APIs, but these API have a bunch of fast paths I didn't want to unwind to add this to. The solution I've decided on is to add a new API like the arch_phys_wc APIs (these would have worked but wc_del didn't take a range), and use them from the drivers to add a WC compatible mapping to the table for all VRAM on those GPUs. This means we can then create userspace mapping that won't get degraded to UC. v1.1: use CONFIG_X86_PAT + add some comments in io.h Cc: Toshi Kani Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: x86@kernel.org Cc: mcgrof@suse.com Cc: Dan Williams Acked-by: Ingo Molnar Reviewed-by: Thomas Gleixner Signed-off-by: Dave Airlie --- include/linux/io.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index e2c8419278c1..82ef36eac8a1 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -141,4 +141,26 @@ enum { void *memremap(resource_size_t offset, size_t size, unsigned long flags); void memunmap(void *addr); +/* + * On x86 PAT systems we have memory tracking that keeps track of + * the allowed mappings on memory ranges. This tracking works for + * all the in-kernel mapping APIs (ioremap*), but where the user + * wishes to map a range from a physical device into user memory + * the tracking won't be updated. This API is to be used by + * drivers which remap physical device pages into userspace, + * and wants to make sure they are mapped WC and not UC. + */ +#ifndef arch_io_reserve_memtype_wc +static inline int arch_io_reserve_memtype_wc(resource_size_t base, + resource_size_t size) +{ + return 0; +} + +static inline void arch_io_free_memtype_wc(resource_size_t base, + resource_size_t size) +{ +} +#endif + #endif /* _LINUX_IO_H */ -- cgit v1.2.3 From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2016 10:15:30 -0700 Subject: mm: remove per-zone hashtable of bitlock waitqueues The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object: wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE) where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super(). The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()). It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code. As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case. Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue. Reported-by: Andreas Gruenbacher Tested-by: Bob Peterson Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Steven Whitehouse Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7f2ae99e5daf..0f088f3a2fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,33 +440,7 @@ struct zone { seqlock_t span_seqlock; #endif - /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. - */ - wait_queue_head_t *wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) @@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) static inline bool zone_is_initialized(struct zone *zone) { - return !!zone->wait_table; + return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) -- cgit v1.2.3 From c0a0aba8e478229b2f0956918542152fbad3f794 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 27 Oct 2016 17:46:38 -0700 Subject: kconfig.h: remove config_enabled() macro The use of config_enabled() is ambiguous. For config options, IS_ENABLED(), IS_REACHABLE(), etc. will make intention clearer. Sometimes config_enabled() has been used for non-config options because it is useful to check whether the given symbol is defined or not. I have been tackling on deprecating config_enabled(), and now is the time to finish this work. Some new users have appeared for v4.9-rc1, but it is trivial to replace them: - arch/x86/mm/kaslr.c replace config_enabled() with IS_ENABLED() because CONFIG_X86_ESPFIX64 and CONFIG_EFI are boolean. - include/asm-generic/export.h replace config_enabled() with __is_defined(). Then, config_enabled() can be removed now. Going forward, please use IS_ENABLED(), IS_REACHABLE(), etc. for config options, and __is_defined() for non-config symbols. Link: http://lkml.kernel.org/r/1476616078-32252-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Ingo Molnar Acked-by: Nicolas Pitre Cc: Peter Oberparleiter Cc: Arnd Bergmann Cc: Kees Cook Cc: Michal Marek Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Thomas Garnier Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kconfig.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 15ec117ec537..8f2e059e4d45 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -31,7 +31,6 @@ * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when * the last step cherry picks the 2nd arg, we get a zero. */ -#define config_enabled(cfg) ___is_defined(cfg) #define __is_defined(x) ___is_defined(x) #define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) @@ -41,13 +40,13 @@ * otherwise. For boolean options, this is equivalent to * IS_ENABLED(CONFIG_FOO). */ -#define IS_BUILTIN(option) config_enabled(option) +#define IS_BUILTIN(option) __is_defined(option) /* * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 * otherwise. */ -#define IS_MODULE(option) config_enabled(option##_MODULE) +#define IS_MODULE(option) __is_defined(option##_MODULE) /* * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled -- cgit v1.2.3 From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Oct 2016 11:48:24 +0200 Subject: perf/powerpc: Don't call perf_event_disable() from atomic context The trinity syscall fuzzer triggered following WARN() on powerpc: WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 ... NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0 LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 Call Trace: [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable) [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 Followed by a lockdep warning: =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #7 Tainted: G W ------------------------------- ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ls/2998: #0: (rcu_read_lock){......}, at: [] .__atomic_notifier_call_chain+0x0/0x1c0 #1: (rcu_read_lock){......}, at: [] .hw_breakpoint_handler+0x0/0x2b0 stack backtrace: CPU: 9 PID: 2998 Comm: ls Tainted: G W 4.8.0-rc5+ #7 Call Trace: [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable) [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180 [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0 [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0 [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380 [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60 [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0 [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 While it looks like the first WARN() is probably valid, the other one is triggered by disabling event via perf_event_disable() from atomic context. The event is disabled here in case we were not able to emulate the instruction that hit the breakpoint. By disabling the event we unschedule the event and make sure it's not scheduled back. But we can't call perf_event_disable() from atomic context, instead we need to use the event's pending_disable irq_work method to disable it. Reported-by: Jan Stancek Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Huang Ying Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 060d0ede88df..4741ecdb9817 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); +extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * -- cgit v1.2.3