From f872f5400cc01373d8e29d9c7a5296ccfaf4ccf3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:19 -0800 Subject: mm: Add a vm_special_mapping.fault() method Requiring special mappings to give a list of struct pages is inflexible: it prevents sane use of IO memory in a special mapping, it's inefficient (it requires arch code to initialize a list of struct pages, and it requires the mm core to walk the entire list just to figure out how long it is), and it prevents arch code from doing anything fancy when a special mapping fault occurs. Add a .fault method as an alternative to filling in a .pages array. Looks-OK-to: Andrew Morton Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a26d1677c0bc7e774c33f469451a78ca31e9e6af.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8d1492a114f..c88e48a3c155 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -568,10 +568,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif -struct vm_special_mapping -{ - const char *name; +struct vm_fault; + +struct vm_special_mapping { + const char *name; /* The name, e.g. "[vdso]". */ + + /* + * If .fault is not provided, this points to a + * NULL-terminated array of pages that back the special mapping. + * + * This must not be NULL unless .fault is provided. + */ struct page **pages; + + /* + * If non-NULL, then this is called to resolve page faults + * on the special mapping. If used, .pages is not checked. + */ + int (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); }; enum tlb_flush_reason { -- cgit v1.2.3 From 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:20 -0800 Subject: mm: Add vm_insert_pfn_prot() The x86 vvar vma contains pages with differing cacheability flags. x86 currently implements this by manually inserting all the ptes using (io_)remap_pfn_range when the vma is set up. x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up the mappings as needed. The correct API to use to insert a pfn in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the vma's cache mode, and the HPET page in particular needs to be uncached despite the fact that the rest of the VMA is cached. Add vm_insert_pfn_prot() to support varying cacheability within the same non-COW VMA in a more sane manner. x86 could alternatively use multiple VMAs, but that's messy, would break CRIU, and would create unnecessary VMAs that would waste memory. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Acked-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 00bad7793788..87ef1d7730ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2080,6 +2080,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); -- cgit v1.2.3 From dd42ac8f02aea32661756554aace2095f7181d34 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 16 Oct 2015 15:20:53 +0600 Subject: clockevents: Rename last parameter of clocks_calc_mult_shift() to maxsec Last parameter of the clocks_calc_mult_shift() was renamed from minsec to maxsec in the 5fdade95 (time: Rename misnamed minsec argument of clocks_calc_mult_shift()). Signed-off-by: Alexander Kuleshov Link: http://lkml.kernel.org/r/1444987253-11018-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index bdcf358dfce2..0d442e34c349 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -190,9 +190,9 @@ extern void clockevents_config_and_register(struct clock_event_device *dev, extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); static inline void -clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) +clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec) { - return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec); + return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec); } extern void clockevents_suspend(void); -- cgit v1.2.3 From 9c808765e88efb6fa6af7e2206ef89512f1840a7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:08 +0000 Subject: hrtimer: Add support for CLOCK_MONOTONIC_RAW The KVM/ARM timer implementation arms a hrtimer when a vcpu is blocked (usually because it is waiting for an interrupt) while its timer is going to kick in the future. It is essential that this timer doesn't get adjusted, or the guest will end up being woken-up at the wrong time (NTP running on the host seems to confuse the hell out of some guests). In order to allow this, let's add CLOCK_MONOTONIC_RAW support to hrtimer (it is so far only supported for posix timers). It also has the (limited) benefit of fixing de0421d53bfb ("mac80211_hwsim: shuffle code to prepare for dynamic radios"), which already uses this functionnality without realizing wasn't implemented (just being lucky...). Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..a6d64af5e73f 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,6 +151,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, + HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; -- cgit v1.2.3 From cb150b9d23be6ee7f3a0fff29784f1c5b5ac514d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 27 Jan 2016 13:29:34 +0100 Subject: cfg80211/wext: fix message ordering Since cfg80211 frequently takes actions from its netdev notifier call, wireless extensions messages could still be ordered badly since the wext netdev notifier, since wext is built into the kernel, runs before the cfg80211 netdev notifier. For example, the following can happen: 5: wlan1: mtu 1500 qdisc mq state DOWN group default link/ether 02:00:00:00:01:00 brd ff:ff:ff:ff:ff:ff 5: wlan1: link/ether when setting the interface down causes the wext message. To also fix this, export the wireless_nlevent_flush() function and also call it from the cfg80211 notifier. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index 8f81bbbc38fc..e0f4109e64c6 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -439,6 +439,12 @@ int dev_get_wireless_info(char *buffer, char **start, off_t offset, int length); /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); +#ifdef CONFIG_WEXT_CORE +/* flush all previous wext events - if work is done from netdev notifiers */ +void wireless_nlevent_flush(void); +#else +static inline void wireless_nlevent_flush(void) {} +#endif /* We may need a function to send a stream of events to user space. * More on that later... */ -- cgit v1.2.3 From 9babd5c8caa6e62c116efc3a64a09f65af4112b0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:17 +0100 Subject: resource: Add System RAM resource type The IORESOURCE_MEM I/O resource type is used for all types of memory-mapped ranges, ex. System RAM, System ROM, Video RAM, Persistent Memory, PCI Bus, PCI MMCONFIG, ACPI Tables, IOAPIC, reserved, and so on. This requires walk_system_ram_range(), walk_system_ram_res(), and region_intersects() to use strcmp() against string "System RAM" to search for System RAM ranges in the iomem table, which is inefficient. __ioremap_caller() and reserve_memtype() on x86, for instance, call walk_system_ram_range() for every request to check if a given range is in System RAM ranges. However, adding a new I/O resource type for System RAM is not a viable option, see [1]. There are approx. 3800 references to IORESOURCE_MEM in the kernel/drivers, which makes it very difficult to distinguish their usages between new type and IORESOURCE_MEM. The I/O resource types are also used by the PNP subsystem. Therefore, introduce an extended I/O resource type, IORESOURCE_SYSTEM_RAM, which consists of IORESOURCE_MEM and a new modifier flag IORESOURCE_SYSRAM, see [2]. To keep the code 'if (resource_type(r) == IORESOURCE_MEM)' still working for System RAM, resource_ext_type() is added for extracting extended type bits. Link[1]: http://lkml.kernel.org/r/1449168859.9855.54.camel@hpe.com Link[2]: http://lkml.kernel.org/r/CA+55aFy4WQrWexC4u2LxX9Mw2NVoznw7p3Yh=iF4Xtf7zKWnRw@mail.gmail.com Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/ioport.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 24bea087e7af..4b65d944717f 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -49,12 +49,19 @@ struct resource { #define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */ #define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */ +#define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ +#define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ + #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ + #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 /* No address assigned yet */ #define IORESOURCE_AUTO 0x40000000 #define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */ +/* I/O resource extended types */ +#define IORESOURCE_SYSTEM_RAM (IORESOURCE_MEM|IORESOURCE_SYSRAM) + /* PnP IRQ specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IRQ_HIGHEDGE (1<<0) #define IORESOURCE_IRQ_LOWEDGE (1<<1) @@ -170,6 +177,10 @@ static inline unsigned long resource_type(const struct resource *res) { return res->flags & IORESOURCE_TYPE_BITS; } +static inline unsigned long resource_ext_type(const struct resource *res) +{ + return res->flags & IORESOURCE_EXT_TYPE_BITS; +} /* True iff r1 completely contains r2 */ static inline bool resource_contains(struct resource *r1, struct resource *r2) { -- cgit v1.2.3 From 43ee493bde78da00deaf5737925365c691a036ad Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:19 +0100 Subject: resource: Add I/O resource descriptor walk_iomem_res() and region_intersects() still need to use strcmp() for searching a resource entry by @name in the iomem table. This patch introduces I/O resource descriptor 'desc' in struct resource for the iomem search interfaces. Drivers can assign their unique descriptor to a range when they support the search interfaces. Otherwise, 'desc' is set to IORES_DESC_NONE (0). This avoids changing most of the drivers as they typically allocate resource entries statically, or by calling alloc_resource(), kzalloc(), or alloc_bootmem_low(), which set the field to zero by default. A later patch will address some drivers that use kmalloc() without zero'ing the field. Also change release_mem_region_adjustable() to set 'desc' when its resource entry gets separated. Other resource interfaces are also changed to initialize 'desc' explicitly although alloc_resource() sets it to 0. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/ioport.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 4b65d944717f..983bea05d69c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -20,6 +20,7 @@ struct resource { resource_size_t end; const char *name; unsigned long flags; + unsigned long desc; struct resource *parent, *sibling, *child; }; @@ -112,6 +113,22 @@ struct resource { /* PCI control bits. Shares IORESOURCE_BITS with above PCI ROM. */ #define IORESOURCE_PCI_FIXED (1<<4) /* Do not move resource */ +/* + * I/O Resource Descriptors + * + * Descriptors are used by walk_iomem_res_desc() and region_intersects() + * for searching a specific resource range in the iomem table. Assign + * a new descriptor when a resource range supports the search interfaces. + * Otherwise, resource.desc must be set to IORES_DESC_NONE (0). + */ +enum { + IORES_DESC_NONE = 0, + IORES_DESC_CRASH_KERNEL = 1, + IORES_DESC_ACPI_TABLES = 2, + IORES_DESC_ACPI_NV_STORAGE = 3, + IORES_DESC_PERSISTENT_MEMORY = 4, + IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, +}; /* helpers to define resources */ #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ @@ -120,6 +137,7 @@ struct resource { .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ + .desc = IORES_DESC_NONE, \ } #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ -- cgit v1.2.3 From 1c29f25bf5d6c557017f619b638c619cbbf798c4 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:28 +0100 Subject: memremap: Change region_intersects() to take @flags and @desc Change region_intersects() to identify a target with @flags and @desc, instead of @name with strcmp(). Change the callers of region_intersects(), memremap() and devm_memremap(), to set IORESOURCE_SYSTEM_RAM in @flags and IORES_DESC_NONE in @desc when searching System RAM. Also, export region_intersects() so that the ACPI EINJ error injection driver can call this function in a later patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Dan Williams Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jan Kara Cc: Jiang Liu Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vlastimil Babka Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f1cd22f2df1a..cd5a300d3397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -385,7 +385,8 @@ enum { REGION_MIXED, }; -int region_intersects(resource_size_t offset, size_t size, const char *type); +int region_intersects(resource_size_t offset, size_t size, unsigned long flags, + unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); -- cgit v1.2.3 From 3f33647c41962401272bb60dce67e6094d14dbf2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:29 +0100 Subject: resource: Add walk_iomem_res_desc() Add a new interface, walk_iomem_res_desc(), which walks through the iomem table by identifying a target with @flags and @desc. This interface provides the same functionality as walk_iomem_res(), but does not use strcmp() to @name for better efficiency. walk_iomem_res() is deprecated and will be removed in a later patch. Requested-by: Borislav Petkov Signed-off-by: Toshi Kani [ Fixup comments. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/ioport.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 983bea05d69c..2a4a5e839965 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -268,6 +268,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); extern int +walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)); +extern int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); -- cgit v1.2.3 From a8fc42530ddd19d7580fe8c9f2ea86220a97e94c Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:32 +0100 Subject: resource: Kill walk_iomem_res() walk_iomem_res_desc() replaced walk_iomem_res() and there is no caller to walk_iomem_res() any more. Kill it. Also remove @name from find_next_iomem_res() as it is no longer used. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vinod Koul Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/ioport.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 2a4a5e839965..afb45597fb5f 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -270,9 +270,6 @@ walk_system_ram_res(u64 start, u64 end, void *arg, extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); -extern int -walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) -- cgit v1.2.3 From cb2517653fccaf9f9b4ae968c7ee005c1bbacdc5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Feb 2016 09:08:36 +0000 Subject: sched/debug: Make schedstats a runtime tunable that is disabled by default schedstats is very useful during debugging and performance tuning but it incurs overhead to calculate the stats. As such, even though it can be disabled at build time, it is often enabled as the information is useful. This patch adds a kernel command-line and sysctl tunable to enable or disable schedstats on demand (when it's built in). It is disabled by default as someone who knows they need it can also learn to enable it when necessary. The benefits are dependent on how scheduler-intensive the workload is. If it is then the patch reduces the number of cycles spent calculating the stats with a small benefit from reducing the cache footprint of the scheduler. These measurements were taken from a 48-core 2-socket machine with Xeon(R) E5-2670 v3 cpus although they were also tested on a single socket machine 8-core machine with Intel i7-3770 processors. netperf-tcp 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Hmean 64 560.45 ( 0.00%) 575.98 ( 2.77%) Hmean 128 766.66 ( 0.00%) 795.79 ( 3.80%) Hmean 256 950.51 ( 0.00%) 981.50 ( 3.26%) Hmean 1024 1433.25 ( 0.00%) 1466.51 ( 2.32%) Hmean 2048 2810.54 ( 0.00%) 2879.75 ( 2.46%) Hmean 3312 4618.18 ( 0.00%) 4682.09 ( 1.38%) Hmean 4096 5306.42 ( 0.00%) 5346.39 ( 0.75%) Hmean 8192 10581.44 ( 0.00%) 10698.15 ( 1.10%) Hmean 16384 18857.70 ( 0.00%) 18937.61 ( 0.42%) Small gains here, UDP_STREAM showed nothing intresting and neither did the TCP_RR tests. The gains on the 8-core machine were very similar. tbench4 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Hmean mb/sec-1 500.85 ( 0.00%) 522.43 ( 4.31%) Hmean mb/sec-2 984.66 ( 0.00%) 1018.19 ( 3.41%) Hmean mb/sec-4 1827.91 ( 0.00%) 1847.78 ( 1.09%) Hmean mb/sec-8 3561.36 ( 0.00%) 3611.28 ( 1.40%) Hmean mb/sec-16 5824.52 ( 0.00%) 5929.03 ( 1.79%) Hmean mb/sec-32 10943.10 ( 0.00%) 10802.83 ( -1.28%) Hmean mb/sec-64 15950.81 ( 0.00%) 16211.31 ( 1.63%) Hmean mb/sec-128 15302.17 ( 0.00%) 15445.11 ( 0.93%) Hmean mb/sec-256 14866.18 ( 0.00%) 15088.73 ( 1.50%) Hmean mb/sec-512 15223.31 ( 0.00%) 15373.69 ( 0.99%) Hmean mb/sec-1024 14574.25 ( 0.00%) 14598.02 ( 0.16%) Hmean mb/sec-2048 13569.02 ( 0.00%) 13733.86 ( 1.21%) Hmean mb/sec-3072 12865.98 ( 0.00%) 13209.23 ( 2.67%) Small gains of 2-4% at low thread counts and otherwise flat. The gains on the 8-core machine were slightly different tbench4 on 8-core i7-3770 single socket machine Hmean mb/sec-1 442.59 ( 0.00%) 448.73 ( 1.39%) Hmean mb/sec-2 796.68 ( 0.00%) 794.39 ( -0.29%) Hmean mb/sec-4 1322.52 ( 0.00%) 1343.66 ( 1.60%) Hmean mb/sec-8 2611.65 ( 0.00%) 2694.86 ( 3.19%) Hmean mb/sec-16 2537.07 ( 0.00%) 2609.34 ( 2.85%) Hmean mb/sec-32 2506.02 ( 0.00%) 2578.18 ( 2.88%) Hmean mb/sec-64 2511.06 ( 0.00%) 2569.16 ( 2.31%) Hmean mb/sec-128 2313.38 ( 0.00%) 2395.50 ( 3.55%) Hmean mb/sec-256 2110.04 ( 0.00%) 2177.45 ( 3.19%) Hmean mb/sec-512 2072.51 ( 0.00%) 2053.97 ( -0.89%) In constract, this shows a relatively steady 2-3% gain at higher thread counts. Due to the nature of the patch and the type of workload, it's not a surprise that the result will depend on the CPU used. hackbench-pipes 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Amean 1 0.0637 ( 0.00%) 0.0660 ( -3.59%) Amean 4 0.1229 ( 0.00%) 0.1181 ( 3.84%) Amean 7 0.1921 ( 0.00%) 0.1911 ( 0.52%) Amean 12 0.3117 ( 0.00%) 0.2923 ( 6.23%) Amean 21 0.4050 ( 0.00%) 0.3899 ( 3.74%) Amean 30 0.4586 ( 0.00%) 0.4433 ( 3.33%) Amean 48 0.5910 ( 0.00%) 0.5694 ( 3.65%) Amean 79 0.8663 ( 0.00%) 0.8626 ( 0.43%) Amean 110 1.1543 ( 0.00%) 1.1517 ( 0.22%) Amean 141 1.4457 ( 0.00%) 1.4290 ( 1.16%) Amean 172 1.7090 ( 0.00%) 1.6924 ( 0.97%) Amean 192 1.9126 ( 0.00%) 1.9089 ( 0.19%) Some small gains and losses and while the variance data is not included, it's close to the noise. The UMA machine did not show anything particularly different pipetest 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v2r2 Min Time 4.13 ( 0.00%) 3.99 ( 3.39%) 1st-qrtle Time 4.38 ( 0.00%) 4.27 ( 2.51%) 2nd-qrtle Time 4.46 ( 0.00%) 4.39 ( 1.57%) 3rd-qrtle Time 4.56 ( 0.00%) 4.51 ( 1.10%) Max-90% Time 4.67 ( 0.00%) 4.60 ( 1.50%) Max-93% Time 4.71 ( 0.00%) 4.65 ( 1.27%) Max-95% Time 4.74 ( 0.00%) 4.71 ( 0.63%) Max-99% Time 4.88 ( 0.00%) 4.79 ( 1.84%) Max Time 4.93 ( 0.00%) 4.83 ( 2.03%) Mean Time 4.48 ( 0.00%) 4.39 ( 1.91%) Best99%Mean Time 4.47 ( 0.00%) 4.39 ( 1.91%) Best95%Mean Time 4.46 ( 0.00%) 4.38 ( 1.93%) Best90%Mean Time 4.45 ( 0.00%) 4.36 ( 1.98%) Best50%Mean Time 4.36 ( 0.00%) 4.25 ( 2.49%) Best10%Mean Time 4.23 ( 0.00%) 4.10 ( 3.13%) Best5%Mean Time 4.19 ( 0.00%) 4.06 ( 3.20%) Best1%Mean Time 4.13 ( 0.00%) 4.00 ( 3.39%) Small improvement and similar gains were seen on the UMA machine. The gain is small but it stands to reason that doing less work in the scheduler is a good thing. The downside is that the lack of schedstats and tracepoints may be surprising to experts doing performance analysis until they find the existence of the schedstats= parameter or schedstats sysctl. It will be automatically activated for latencytop and sleep profiling to alleviate the problem. For tracepoints, there is a simple warning as it's not safe to activate schedstats in the context when it's known the tracepoint may be wanted but is unavailable. Signed-off-by: Mel Gorman Reviewed-by: Matt Fleming Reviewed-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454663316-22048-1-git-send-email-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- include/linux/latencytop.h | 3 +++ include/linux/sched.h | 4 ++++ include/linux/sched/sysctl.h | 4 ++++ 3 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index e23121f9d82a..59ccab297ae0 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_all_latency_tracing(struct task_struct *p); +extern int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #else static inline void diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..a292c4b7e94c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,10 @@ static inline int sched_info_on(void) #endif } +#ifdef CONFIG_SCHEDSTATS +void force_schedstat_enabled(void); +#endif + enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731cf10b..4f080ab4f2cd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #endif /* _SCHED_SYSCTL_H */ -- cgit v1.2.3 From a63f38cc4ccfa076f87fc3d0c276ee62e710f953 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 3 Feb 2016 13:44:12 -0800 Subject: locking/lockdep: Convert hash tables to hlists Mike said: : CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i.e. : kernel with CONFIG_UBSAN_ALIGNMENT=y fails to load without even any error : message. : : The problem is that ubsan callbacks use spinlocks and might be called : before lockdep is initialized. Particularly this line in the : reserve_ebda_region function causes problem: : : lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); : : If i put lockdep_init() before reserve_ebda_region call in : x86_64_start_reservations kernel loads well. Fix this ordering issue permanently: change lockdep so that it uses hlists for the hash tables. Unlike a list_head, an hlist_head is in its initialized state when it is all-zeroes, so lockdep is ready for operation immediately upon boot - lockdep_init() need not have run. The patch will also save some memory. Probably lockdep_init() and lockdep_initialized can be done away with now. Suggested-by: Mike Krinkin Reported-by: Mike Krinkin Signed-off-by: Andrew Morton Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: mm-commits@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c57e424d914b..4dca42fd32f5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,7 +66,7 @@ struct lock_class { /* * class-hash: */ - struct list_head hash_entry; + struct hlist_node hash_entry; /* * global list of all lock-classes: @@ -199,7 +199,7 @@ struct lock_chain { u8 irq_context; u8 depth; u16 base; - struct list_head entry; + struct hlist_node entry; u64 chain_key; }; -- cgit v1.2.3 From 06bea3dbfe6a4c333c4333362c46bdf4d9e43504 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 4 Feb 2016 11:29:36 -0800 Subject: locking/lockdep: Eliminate lockdep_init() Lockdep is initialized at compile time now. Get rid of lockdep_init(). Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Cc: Linus Torvalds Cc: Mike Krinkin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: mm-commits@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 4dca42fd32f5..d026b190c530 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -261,7 +261,6 @@ struct held_lock { /* * Initialization, self-test and debugging-output methods: */ -extern void lockdep_init(void); extern void lockdep_info(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); @@ -392,7 +391,6 @@ static inline void lockdep_on(void) # define lockdep_set_current_reclaim_state(g) do { } while (0) # define lockdep_clear_current_reclaim_state() do { } while (0) # define lockdep_trace_alloc(g) do { } while (0) -# define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) -- cgit v1.2.3 From fed0764fafd8e2e629a033c0f7df4106b0dcb7f0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 25 Jan 2016 16:33:20 -0500 Subject: locking/atomics: Update comment about READ_ONCE() and structures The comment is out of data. Also point out the performance drawback of the barrier();__builtin_memcpy(); barrier() followed by another copy from stack (__u) to lvalue; Signed-off-by: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453757600-11441-1-git-send-email-konrad.wilk@oracle.com [ Made it a bit more readable. ] Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 00b042c49ccd..4291592b6433 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -263,8 +263,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * In contrast to ACCESS_ONCE these two macros will also work on aggregate * data types like structs or unions. If the size of the accessed data * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a - * compile-time warning. + * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at + * least two memcpy()s: one for the __builtin_memcpy() and then one for + * the macro doing the copy of variable - '__u' allocated on the stack. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, -- cgit v1.2.3 From 287e6611ab1eac76c2c5ebf6e345e04c80ca9c61 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Feb 2016 14:16:27 +0100 Subject: libata: fix HDIO_GET_32BIT ioctl As reported by Soohoon Lee, the HDIO_GET_32BIT ioctl does not work correctly in compat mode with libata. I have investigated the issue further and found multiple problems that all appeared with the same commit that originally introduced HDIO_GET_32BIT handling in libata back in linux-2.6.8 and presumably also linux-2.4, as the code uses "copy_to_user(arg, &val, 1)" to copy a 'long' variable containing either 0 or 1 to user space. The problems with this are: * On big-endian machines, this will always write a zero because it stores the wrong byte into user space. * In compat mode, the upper three bytes of the variable are updated by the compat_hdio_ioctl() function, but they now contain uninitialized stack data. * The hdparm tool calling this ioctl uses a 'static long' variable to store the result. This means at least the upper bytes are initialized to zero, but calling another ioctl like HDIO_GET_MULTCOUNT would fill them with data that remains stale when the low byte is overwritten. Fortunately libata doesn't implement any of the affected ioctl commands, so this would only happen when we query both an IDE and an ATA device in the same command such as "hdparm -N -c /dev/hda /dev/sda" * The libata code for unknown reasons started using ATA_IOC_GET_IO32 and ATA_IOC_SET_IO32 as aliases for HDIO_GET_32BIT and HDIO_SET_32BIT, while the ioctl commands that were added later use the normal HDIO_* names. This is harmless but rather confusing. This addresses all four issues by changing the code to use put_user() on an 'unsigned long' variable in HDIO_GET_32BIT, like the IDE subsystem does, and by clarifying the names of the ioctl commands. Signed-off-by: Arnd Bergmann Reported-by: Soohoon Lee Tested-by: Soohoon Lee Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo --- include/linux/ata.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ata.h b/include/linux/ata.h index d2992bfa1706..c1a2f345cbe6 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -487,8 +487,8 @@ enum ata_tf_protocols { }; enum ata_ioctls { - ATA_IOC_GET_IO32 = 0x309, - ATA_IOC_SET_IO32 = 0x324, + ATA_IOC_GET_IO32 = 0x309, /* HDIO_GET_32BIT */ + ATA_IOC_SET_IO32 = 0x324, /* HDIO_SET_32BIT */ }; /* core structures */ -- cgit v1.2.3 From 5fd7a09cfb8c6852f596c1f8c891c6158395250e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2015 18:03:23 +0200 Subject: atomic: Export fetch_or() Export fetch_or() that's implemented and used internally by the scheduler. We are going to use it for NO_HZ so make it generally available. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/atomic.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 301de78d65f7..6c502cb13c95 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -548,6 +548,27 @@ static inline int atomic_dec_if_positive(atomic_t *v) } #endif +/** + * fetch_or - perform *ptr |= mask and return old value of *ptr + * @ptr: pointer to value + * @mask: mask to OR on the value + * + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#ifndef fetch_or +#define fetch_or(ptr, mask) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (mask)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) +#endif + + #ifdef CONFIG_GENERIC_ATOMIC64 #include #endif -- cgit v1.2.3 From 1f4522400e7e49464da5e584a463bd315283790f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 11 Feb 2016 12:28:55 -0200 Subject: [media] [for,v4.5] media.h: increase the spacing between function ranges Each function range is quite narrow and especially for connectors this will pose a problem. Increase the function ranges while we still can and move the connector range to the end so that range is practically limitless. [mchehab@osg.samsung.com: Rebased to apply at Linus tree] Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 1e3c8cb43bd7..8b87bddecf1c 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -72,21 +72,21 @@ struct media_device_info { #define MEDIA_ENT_F_DTV_NET_DECAP (MEDIA_ENT_F_BASE + 4) /* - * Connectors + * I/O entities */ -/* It is a responsibility of the entity drivers to add connectors and links */ -#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 21) -#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 22) -#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 23) -/* For internal test signal generators and other debug connectors */ -#define MEDIA_ENT_F_CONN_TEST (MEDIA_ENT_F_BASE + 24) +#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 1001) +#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 1002) +#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 1003) /* - * I/O entities + * Connectors */ -#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 31) -#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 32) -#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 33) +/* It is a responsibility of the entity drivers to add connectors and links */ +#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 10001) +#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 10002) +#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 10003) +/* For internal test signal generators and other debug connectors */ +#define MEDIA_ENT_F_CONN_TEST (MEDIA_ENT_F_BASE + 10004) /* * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and -- cgit v1.2.3 From 9727a9545adec59f4bccb83d1a709711f4acf242 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 12 Feb 2016 15:44:31 -0200 Subject: [media] media.h: get rid of MEDIA_ENT_F_CONN_TEST Defining it as a connector was a bad idea. Remove it while it is not too late. Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 8b87bddecf1c..f0328524be6e 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -85,8 +85,6 @@ struct media_device_info { #define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 10001) #define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 10002) #define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 10003) -/* For internal test signal generators and other debug connectors */ -#define MEDIA_ENT_F_CONN_TEST (MEDIA_ENT_F_BASE + 10004) /* * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and -- cgit v1.2.3 From e267d97b83d9cecc16c54825f9f3ac7f72dc1e1e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:12 -0800 Subject: asm-generic: Consolidate mark_rodata_ro() Instead of defining mark_rodata_ro() in each architecture, consolidate it. Signed-off-by: Kees Cook Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Gross Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Ashok Kumar Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Dan Williams Cc: David Brown Cc: David Hildenbrand Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Helge Deller Cc: James E.J. Bottomley Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Marc Zyngier Cc: Mark Rutland Cc: Mathias Krause Cc: Michael Ellerman Cc: Nicolas Pitre Cc: PaX Team Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Russell King Cc: Rusty Russell Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- include/linux/init.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/init.h b/include/linux/init.h index b449f378f995..aedb254abc37 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -142,6 +142,10 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void); +#endif + extern void (*late_time_init)(void); extern bool initcall_debug; -- cgit v1.2.3 From c74ba8b3480da6ddaea17df2263ec09b869ac496 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:15 -0800 Subject: arch: Introduce post-init read-only memory One of the easiest ways to protect the kernel from attack is to reduce the internal attack surface exposed when a "write" flaw is available. By making as much of the kernel read-only as possible, we reduce the attack surface. Many things are written to only during __init, and never changed again. These cannot be made "const" since the compiler will do the wrong thing (we do actually need to write to them). Instead, move these items into a memory region that will be made read-only during mark_rodata_ro() which happens after all kernel __init code has finished. This introduces __ro_after_init as a way to mark such memory, and adds some documentation about the existing __read_mostly marking. This improves the security of the Linux kernel by marking formerly read-write memory regions as read-only on a fully booted up system. Based on work by PaX Team and Brad Spengler. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 1 + include/linux/cache.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4bd0e2c173c..772c784ba763 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -256,6 +256,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ + *(.data..ro_after_init) /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ diff --git a/include/linux/cache.h b/include/linux/cache.h index 17e7e82d2aa7..1be04f8c563a 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -12,10 +12,24 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif +/* + * __read_mostly is used to keep rarely changing variables out of frequently + * updated cachelines. If an architecture doesn't support it, ignore the + * hint. + */ #ifndef __read_mostly #define __read_mostly #endif +/* + * __ro_after_init is used to mark things that are read-only after init (i.e. + * after mark_rodata_ro() has been called). These are effectively read-only, + * but may get written to during init, so can't live in .rodata (via "const"). + */ +#ifndef __ro_after_init +#define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) +#endif + #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif -- cgit v1.2.3 From ad315455d396a1cbcb2f9fdd687b7e1b26b789e7 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:46 +0800 Subject: sparse: Add __private to privatize members of structs In C programming language, we don't have a easy way to privatize a member of a structure. However in kernel, sometimes there is a need to privatize a member in case of potential bugs or misuses. Fortunately, the noderef attribute of sparse is a way to privatize a member, as by defining a member as noderef, the address-of operator on the member will produce a noderef pointer to that member, and if anyone wants to dereference that kind of pointers to read or modify the member, sparse will yell. Based on this, __private modifier and related operation ACCESS_PRIVATE() are introduced, which could help detect undesigned public uses of private members of structs. Here is an example of sparse's output if it detect an undersigned public use: | kernel/rcu/tree.c:4453:25: warning: incorrect type in argument 1 (different modifiers) | kernel/rcu/tree.c:4453:25: expected struct raw_spinlock [usertype] *lock | kernel/rcu/tree.c:4453:25: got struct raw_spinlock [noderef] * Also, this patch improves compiler.h a little bit by adding comments for "#else" and "#endif". Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/compiler.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 00b042c49ccd..c845356952bb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -20,12 +20,14 @@ # define __pmem __attribute__((noderef, address_space(5))) #ifdef CONFIG_SPARSE_RCU_POINTER # define __rcu __attribute__((noderef, address_space(4))) -#else +#else /* CONFIG_SPARSE_RCU_POINTER */ # define __rcu -#endif +#endif /* CONFIG_SPARSE_RCU_POINTER */ +# define __private __attribute__((noderef)) extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); -#else +# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) +#else /* __CHECKER__ */ # define __user # define __kernel # define __safe @@ -44,7 +46,9 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __percpu # define __rcu # define __pmem -#endif +# define __private +# define ACCESS_PRIVATE(p, member) ((p)->member) +#endif /* __CHECKER__ */ /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ #define ___PASTE(a,b) a##b -- cgit v1.2.3 From b354286effa52da6cb1b1f16604d41ff81b8c445 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:48 +0800 Subject: irq: Privatize irq_common_data::state_use_accessors irq_common_data::state_use_accessors is not designed for public use. Therefore make it private so that people who write code accessing it directly will get blamed by sparse. Also #undef the macro __irqd_to_state after used in header files, so that the macro can't be misused. Signed-off-by: Boqun Feng Reviewed-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/irq.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c96786248..cd14cd4a22b4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,7 +137,7 @@ struct irq_domain; * @msi_desc: MSI descriptor */ struct irq_common_data { - unsigned int state_use_accessors; + unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif @@ -208,7 +208,7 @@ enum { IRQD_FORWARDED_TO_VCPU = (1 << 20), }; -#define __irqd_to_state(d) ((d)->common->state_use_accessors) +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { @@ -299,6 +299,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } +#undef __irqd_to_state + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; -- cgit v1.2.3 From 9de630c4f264dec48e61edd871cb98b8e1b58250 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Jan 2016 07:43:50 -0800 Subject: rcu: Document unique-name limitation for DEFINE_STATIC_SRCU() SRCU uses per-CPU variables, and DEFINE_STATIC_SRCU() uses a static per-CPU variable. However, per-CPU variables have significant restrictions, for example, names of per-CPU variables must be globally unique, even if declared static. These restrictions carry over to DEFINE_STATIC_SRCU(), and this commit therefore documents these restrictions. Reported-by: Stephen Rothwell Reported-by: kbuild test robot Suggested-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Tejun Heo --- include/linux/srcu.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f5f80c5643ac..dc8eb63c6568 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -99,8 +99,23 @@ void process_srcu(struct work_struct *work); } /* - * define and init a srcu struct at build time. - * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ -- cgit v1.2.3 From 3500efae4410454522697c94c23fc40323c0cee9 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 19 Oct 2015 14:45:02 -0700 Subject: rcu: Remove rcu_user_hooks_switch Because there are neither uses nor intended uses for the rcu_user_hooks_switch() function that was orginally intended for nohz use, this commit removes it. Signed-off-by: Yang Shi Acked-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47ee16f..b5d48bd56e3f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -360,8 +360,6 @@ void rcu_user_exit(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } -static inline void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next) { } #endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU -- cgit v1.2.3 From f4bcfa1da6fdcbc7a0854a28603bffc3c5656332 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 23 Feb 2016 22:39:28 +0900 Subject: sched/wait: Fix wait_event_freezable() documentation I noticed the comment label 'wait_event' was wrong. Signed-off-by: Stafford Horne Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1456234768-24933-1-git-send-email-shorne@gmail.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index ae71a769b89e..27d7a0ab5da3 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -338,7 +338,7 @@ do { \ schedule(); try_to_freeze()) /** - * wait_event - sleep (or freeze) until a condition gets true + * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * -- cgit v1.2.3 From 2da897e51d7f23f872224218b9a4314503b0d360 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Feb 2016 02:05:26 +0100 Subject: bpf: fix csum setting for bpf_set_tunnel_key The fix in 35e2d1152b22 ("tunnels: Allow IPv6 UDP checksums to be correctly controlled.") changed behavior for bpf_set_tunnel_key() when in use with IPv6 and thus uncovered a bug that TUNNEL_CSUM needed to be set but wasn't. As a result, the stack dropped ingress vxlan IPv6 packets, that have been sent via eBPF through collect meta data mode due to checksum now being zero. Since after LCO, we enable IPv4 checksum by default, so make that analogous and only provide a flag BPF_F_ZERO_CSUM_TX for the user to turn it off in IPv4 case. Fixes: 35e2d1152b22 ("tunnels: Allow IPv6 UDP checksums to be correctly controlled.") Fixes: c6c33454072f ("bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6f8571de13..5df4881dea7b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -292,6 +292,9 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) +/* BPF_FUNC_skb_set_tunnel_key flags. */ +#define BPF_F_ZERO_CSUM_TX (1ULL << 1) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ -- cgit v1.2.3 From 0abefbaab4edbcec637e00fefcdeccb52797fe4f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:12 +0000 Subject: genirq: Add new IPI irqdomain flags These flags will be used to identify an IPI domain. We have two flavours of IPI implementations: IRQ_DOMAIN_FLAG_IPI_PER_CPU: Each CPU has its own virq and hwirq IRQ_DOMAIN_FLAG_IPI_SINGLE : A single virq and hwirq for all CPUs Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-2-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 04579d9fbce4..9bb0a9cfc1c4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -172,6 +172,12 @@ enum { /* Core calls alloc/free recursive through the domain hierarchy. */ IRQ_DOMAIN_FLAG_AUTO_RECURSIVE = (1 << 1), + /* Irq domain is an IPI domain with virq per cpu */ + IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2), + + /* Irq domain is an IPI domain with single virq */ + IRQ_DOMAIN_FLAG_IPI_SINGLE = (1 << 3), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -400,6 +406,22 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return domain->flags & + (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE); +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_PER_CPU; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE; +} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void irq_domain_activate_irq(struct irq_data *data) { } static inline void irq_domain_deactivate_irq(struct irq_data *data) { } @@ -413,6 +435,21 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return false; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return false; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #else /* CONFIG_IRQ_DOMAIN */ -- cgit v1.2.3 From 29d5c8db26ad54592436508819ac617119306f96 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:13 +0000 Subject: genirq: Add DOMAIN_BUS_IPI We need a way to search and match IPI domains. Using the new enum we can use irq_find_matching_host() to do that. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-3-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9bb0a9cfc1c4..130e1c3117c3 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -74,6 +74,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, + DOMAIN_BUS_IPI, }; /** -- cgit v1.2.3 From 955bfe5912e7839abcc83694f06867535487404b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:17 +0000 Subject: genirq: Add an extra comment about the use of affinity in irq_common_data Affinity will have dual meaning depends on the type of the irq. If it is a normal irq, it'll have the standard affinity meaning. If it is an IPI, it will hold the mask of the cpus to which an IPI can be sent. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-7-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c96786248..0817afd0d719 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -133,7 +133,9 @@ struct irq_domain; * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods - * @affinity: IRQ affinity on SMP + * @affinity: IRQ affinity on SMP. If this is an IPI + * related irq, then this is the mask of the + * CPUs to which an IPI can be sent. * @msi_desc: MSI descriptor */ struct irq_common_data { -- cgit v1.2.3 From f256c9a0c54820ffef21b126f8226be2bece3dd7 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:16 +0000 Subject: genirq: Add ipi_offset to irq_common_data IPIs are always assumed to be consecutively allocated, hence virqs and hwirqs can be inferred by using CPU id as an offset. But the first cpu doesn't always have to start at offset 0. ipi_offset stores the position of the first cpu so that we can easily calculate the virq or hwirq of an IPI associated with a specific cpu. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-6-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 0817afd0d719..a32b47fbf874 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,6 +137,7 @@ struct irq_domain; * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @msi_desc: MSI descriptor + * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ struct irq_common_data { unsigned int state_use_accessors; @@ -146,6 +147,9 @@ struct irq_common_data { void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; +#ifdef CONFIG_GENERIC_IRQ_IPI + unsigned int ipi_offset; +#endif }; /** -- cgit v1.2.3 From ac0a0cd266d1f21236d5975ca6bced9b377a2a6a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:18 +0000 Subject: genirq: Make irq_domain_alloc_descs() non static We will need to use this function to implement irq_reserve_ipi() later. So make it non static and move the prototype to irqdomain.h to allow using it outside irqdomain.c Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-8-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 130e1c3117c3..c466cc17b8e9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -213,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); +extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { -- cgit v1.2.3 From d17bf24e695290d3fe7943aca52ab48098a10653 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:19 +0000 Subject: genirq: Add a new generic IPI reservation code to irq core Add a generic mechanism to dynamically allocate an IPI. Depending on the underlying implementation this creates either a single Linux irq or a consective range of Linux irqs. The Linux irq is used later to send IPIs to other CPUs. [ tglx: Massaged the code and removed the 'consecutive mask' restriction for the single IRQ case ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-9-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 +++ include/linux/irqdomain.h | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index a32b47fbf874..95f4f66f95f3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -940,4 +940,7 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, return readl(gc->reg_base + reg_offset); } +/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ +#define INVALID_HWIRQ (~0UL) + #endif /* _LINUX_IRQ_H */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index c466cc17b8e9..ed48594e96d2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -344,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); +/* IPI functions */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest); +void irq_destroy_ipi(unsigned int irq); + /* V2 interfaces to support hierarchy IRQ domains. */ extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); -- cgit v1.2.3 From f9bce791ae2a1a10a965b30427f5507c1a77669f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:20 +0000 Subject: genirq: Add a new function to get IPI reverse mapping When dealing with coprocessors we need to find out the actual hwirqs values to pass on to the firmware so that it knows what it needs to use to receive IPIs from and send IPIs to Linux cpus. [ tglx: Fixed the single hwirq IPI case. The hardware irq number does not change due to the cpu number ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-10-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 95f4f66f95f3..10273dce058a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -942,5 +942,6 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); #endif /* _LINUX_IRQ_H */ -- cgit v1.2.3 From 34dc1ae101018dbb50e1d04e88aa89052802a7db Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:21 +0000 Subject: genirq: Add send_ipi callbacks to irq_chip Introduce the new callbacks which can be used by the core code to implement a generic IPI send mechanism. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-11-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 10273dce058a..3b3a5b817469 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -347,6 +347,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine + * @ipi_send_single: send a single IPI to destination cpus + * @ipi_send_mask: send an IPI to destination cpus in cpumask * @flags: chip specific flags */ struct irq_chip { @@ -391,6 +393,9 @@ struct irq_chip { int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); + void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); + void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); + unsigned long flags; }; -- cgit v1.2.3 From 3b8e29a82dd16c1f2061e0b955a71cd36eeb061b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:22 +0000 Subject: genirq: Implement ipi_send_mask/single() Add APIs to send IPIs from driver and arch code. We have different functions because we allow architecture code to cache the irq descriptor to avoid lookups. Driver code has to use the irq number and is subject to more restrictive checks. [ tglx: Polish the implementation ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-12-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 3b3a5b817469..d5ebd94822d2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -948,5 +948,9 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); +int ipi_send_single(unsigned int virq, unsigned int cpu); +int ipi_send_mask(unsigned int virq, const struct cpumask *dest); #endif /* _LINUX_IRQ_H */ -- cgit v1.2.3 From bb11cff327e54179c13446c4022ed4ed7d4871c7 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:28 +0000 Subject: MIPS: Make smp CMP, CPS and MT use the new generic IPI functions This commit does several things to avoid breaking bisectability. 1- Remove IPI init code from irqchip/mips-gic 2- Implement the new irqchip->send_ipi() in irqchip/mips-gic 3- Select GENERIC_IRQ_IPI Kconfig symbol for MIPS_GIC 4- Change MIPS SMP to use the generic IPI implementation Only the SMP variants that use GIC were converted as it's the only irqchip that will have the support for generic IPI for now. Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-18-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqchip/mips-gic.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index ce824db48d64..80f89e4a29ac 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -261,9 +261,6 @@ extern void gic_write_compare(cycle_t cnt); extern void gic_write_cpu_compare(cycle_t cnt, int cpu); extern void gic_start_count(void); extern void gic_stop_count(void); -extern void gic_send_ipi(unsigned int intr); -extern unsigned int plat_ipi_call_int_xlate(unsigned int); -extern unsigned int plat_ipi_resched_int_xlate(unsigned int); extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); -- cgit v1.2.3 From 13b35686e8b934ff78f59cef0c65fa3a43f8eeaf Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 19 Feb 2016 09:46:37 +0100 Subject: wait.[ch]: Introduce the simple waitqueue (swait) implementation The existing wait queue support has support for custom wake up call backs, wake flags, wake key (passed to call back) and exclusive flags that allow wakers to be tagged as exclusive, for limiting the number of wakers. In a lot of cases, none of these features are used, and hence we can benefit from a slimmed down version that lowers memory overhead and reduces runtime overhead. The concept originated from -rt, where waitqueues are a constant source of trouble, as we can't convert the head lock to a raw spinlock due to fancy and long lasting callbacks. With the removal of custom callbacks, we can use a raw lock for queue list manipulations, hence allowing the simple wait support to be used in -rt. [Patch is from PeterZ which is based on Thomas version. Commit message is written by Paul G. Daniel: - Fixed some compile issues - Added non-lazy implementation of swake_up_locked as suggested by Boqun Feng.] Originally-by: Thomas Gleixner Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-2-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner --- include/linux/swait.h | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 include/linux/swait.h (limited to 'include') diff --git a/include/linux/swait.h b/include/linux/swait.h new file mode 100644 index 000000000000..c1f9c62a8a50 --- /dev/null +++ b/include/linux/swait.h @@ -0,0 +1,172 @@ +#ifndef _LINUX_SWAIT_H +#define _LINUX_SWAIT_H + +#include +#include +#include +#include + +/* + * Simple wait queues + * + * While these are very similar to the other/complex wait queues (wait.h) the + * most important difference is that the simple waitqueue allows for + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold + * times. + * + * In order to make this so, we had to drop a fair number of features of the + * other waitqueue code; notably: + * + * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue; + * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right + * sleeper state. + * + * - the exclusive mode; because this requires preserving the list order + * and this is hard. + * + * - custom wake functions; because you cannot give any guarantees about + * random code. + * + * As a side effect of this; the data structures are slimmer. + * + * One would recommend using this wait queue where possible. + */ + +struct task_struct; + +struct swait_queue_head { + raw_spinlock_t lock; + struct list_head task_list; +}; + +struct swait_queue { + struct task_struct *task; + struct list_head task_list; +}; + +#define __SWAITQUEUE_INITIALIZER(name) { \ + .task = current, \ + .task_list = LIST_HEAD_INIT((name).task_list), \ +} + +#define DECLARE_SWAITQUEUE(name) \ + struct swait_queue name = __SWAITQUEUE_INITIALIZER(name) + +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .task_list = LIST_HEAD_INIT((name).task_list), \ +} + +#define DECLARE_SWAIT_QUEUE_HEAD(name) \ + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name) + +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key); + +#define init_swait_queue_head(q) \ + do { \ + static struct lock_class_key __key; \ + __init_swait_queue_head((q), #q, &__key); \ + } while (0) + +#ifdef CONFIG_LOCKDEP +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ + ({ init_swait_queue_head(&name); name; }) +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) +#else +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ + DECLARE_SWAIT_QUEUE_HEAD(name) +#endif + +static inline int swait_active(struct swait_queue_head *q) +{ + return !list_empty(&q->task_list); +} + +extern void swake_up(struct swait_queue_head *q); +extern void swake_up_all(struct swait_queue_head *q); +extern void swake_up_locked(struct swait_queue_head *q); + +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); + +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); + +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */ +#define ___swait_event(wq, condition, state, ret, cmd) \ +({ \ + struct swait_queue __wait; \ + long __ret = ret; \ + \ + INIT_LIST_HEAD(&__wait.task_list); \ + for (;;) { \ + long __int = prepare_to_swait_event(&wq, &__wait, state);\ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + break; \ + } \ + \ + cmd; \ + } \ + finish_swait(&wq, &__wait); \ + __ret; \ +}) + +#define __swait_event(wq, condition) \ + (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ + schedule()) + +#define swait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event(wq, condition); \ +} while (0) + +#define __swait_event_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, timeout, \ + __ret = schedule_timeout(__ret)) + +#define swait_event_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_timeout(wq, condition, timeout); \ + __ret; \ +}) + +#define __swait_event_interruptible(wq, condition) \ + ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ + schedule()) + +#define swait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = __swait_event_interruptible(wq, condition); \ + __ret; \ +}) + +#define __swait_event_interruptible_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, timeout, \ + __ret = schedule_timeout(__ret)) + +#define swait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_interruptible_timeout(wq, \ + condition, timeout); \ + __ret; \ +}) + +#endif /* _LINUX_SWAIT_H */ -- cgit v1.2.3 From 8577370fb0cbe88266b7583d8d3b9f43ced077a0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 19 Feb 2016 09:46:39 +0100 Subject: KVM: Use simple waitqueue for vcpu->wq The problem: On -rt, an emulated LAPIC timer instances has the following path: 1) hard interrupt 2) ksoftirqd is scheduled 3) ksoftirqd wakes up vcpu thread 4) vcpu thread is scheduled This extra context switch introduces unnecessary latency in the LAPIC path for a KVM guest. The solution: Allow waking up vcpu thread from hardirq context, thus avoiding the need for ksoftirqd to be scheduled. Normal waitqueues make use of spinlocks, which on -RT are sleepable locks. Therefore, waking up a waitqueue waiter involves locking a sleeping lock, which is not allowed from hard interrupt context. cyclictest command line: This patch reduces the average latency in my tests from 14us to 11us. Daniel writes: Paolo asked for numbers from kvm-unit-tests/tscdeadline_latency benchmark on mainline. The test was run 1000 times on tip/sched/core 4.4.0-rc8-01134-g0905f04: ./x86-run x86/tscdeadline_latency.flat -cpu host with idle=poll. The test seems not to deliver really stable numbers though most of them are smaller. Paolo write: "Anything above ~10000 cycles means that the host went to C1 or lower---the number means more or less nothing in that case. The mean shows an improvement indeed." Before: min max mean std count 1000.000000 1000.000000 1000.000000 1000.000000 mean 5162.596000 2019270.084000 5824.491541 20681.645558 std 75.431231 622607.723969 89.575700 6492.272062 min 4466.000000 23928.000000 5537.926500 585.864966 25% 5163.000000 1613252.750000 5790.132275 16683.745433 50% 5175.000000 2281919.000000 5834.654000 23151.990026 75% 5190.000000 2382865.750000 5861.412950 24148.206168 max 5228.000000 4175158.000000 6254.827300 46481.048691 After min max mean std count 1000.000000 1000.00000 1000.000000 1000.000000 mean 5143.511000 2076886.10300 5813.312474 21207.357565 std 77.668322 610413.09583 86.541500 6331.915127 min 4427.000000 25103.00000 5529.756600 559.187707 25% 5148.000000 1691272.75000 5784.889825 17473.518244 50% 5160.000000 2308328.50000 5832.025000 23464.837068 75% 5172.000000 2393037.75000 5853.177675 24223.969976 max 5222.000000 3922458.00000 6186.720500 42520.379830 [Patch was originaly based on the swait implementation found in the -rt tree. Daniel ported it to mainline's version and gathered the benchmark numbers for tscdeadline_latency test.] Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-4-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner --- include/linux/kvm_host.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 861f690aa791..5276fe0916fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -218,7 +219,7 @@ struct kvm_vcpu { int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; unsigned char fpu_counter; - wait_queue_head_t wq; + struct swait_queue_head wq; struct pid *pid; int sigset_active; sigset_t sigset; @@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) } #endif -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP return vcpu->arch.wqp; -- cgit v1.2.3 From 4ee34ea3a12396f35b26d90a094c75db95080baa Mon Sep 17 00:00:00 2001 From: Harvey Hunt Date: Wed, 24 Feb 2016 15:16:43 +0000 Subject: libata: Align ata_device's id on a cacheline The id buffer in ata_device is a DMA target, but it isn't explicitly cacheline aligned. Due to this, adjacent fields can be overwritten with stale data from memory on non coherent architectures. As a result, the kernel is sometimes unable to communicate with an ATA device. Fix this by ensuring that the id buffer is cacheline aligned. This issue is similar to that fixed by Commit 84bda12af31f ("libata: align ap->sector_buf"). Signed-off-by: Harvey Hunt Cc: linux-kernel@vger.kernel.org Cc: # 2.6.18 Signed-off-by: Tejun Heo --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index bec2abbd7ab2..2c4ebef79d0c 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -720,7 +720,7 @@ struct ata_device { union { u16 id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */ u32 gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */ - }; + } ____cacheline_aligned; /* DEVSLP Timing Variables from Identify Device Data Log */ u8 devslp_timing[ATA_LOG_DEVSLP_SIZE]; -- cgit v1.2.3 From f4833a519aec793cf8349bf479589d37473ef6a7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Feb 2016 17:38:14 +0100 Subject: ASoC: trace: fix printing jack name After a change to the snd_jack structure, the 'name' member is no longer available in all configurations, which results in a build failure in the tracing code: include/trace/events/asoc.h: In function 'trace_event_raw_event_snd_soc_jack_report': include/trace/events/asoc.h:240:32: error: 'struct snd_jack' has no member named 'name' The name field is normally initialized from the card shortname and the jack "id" field: snprintf(jack->name, sizeof(jack->name), "%s %s", card->shortname, jack->id); This changes the tracing output to just contain the 'id' by itself, which slightly changes the output format but avoids the link error and is hopefully still enough to see what is going on. Signed-off-by: Arnd Bergmann Fixes: fe0d128c57bf ("ALSA: jack: Allow building the jack layer without input device") Signed-off-by: Mark Brown --- include/trace/events/asoc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h index 317a1ed2f4ac..9130dd5a184a 100644 --- a/include/trace/events/asoc.h +++ b/include/trace/events/asoc.h @@ -231,13 +231,13 @@ TRACE_EVENT(snd_soc_jack_report, TP_ARGS(jack, mask, val), TP_STRUCT__entry( - __string( name, jack->jack->name ) + __string( name, jack->jack->id ) __field( int, mask ) __field( int, val ) ), TP_fast_assign( - __assign_str(name, jack->jack->name); + __assign_str(name, jack->jack->id); __entry->mask = mask; __entry->val = val; ), @@ -253,12 +253,12 @@ TRACE_EVENT(snd_soc_jack_notify, TP_ARGS(jack, val), TP_STRUCT__entry( - __string( name, jack->jack->name ) + __string( name, jack->jack->id ) __field( int, val ) ), TP_fast_assign( - __assign_str(name, jack->jack->name); + __assign_str(name, jack->jack->id); __entry->val = val; ), -- cgit v1.2.3 From 7aca0c07207385cca76025cc85231519935722b9 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 26 Feb 2016 19:14:13 -0800 Subject: clocksource: Introduce clocksource_freq2mult() The clocksource_khz2mult() and clocksource_hz2mult() share similar code wihch calculates a mult from the given frequency. Both implementations in differ only in value of a frequency. This patch introduces the clocksource_freq2mult() helper with generic implementation of mult calculation to prevent code duplication. Signed-off-by: Alexander Kuleshov Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1456542854-22104-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 6013021a3b39..a307bf62974f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -118,6 +118,23 @@ struct clocksource { /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) +static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from) +{ + /* freq = cyc/from + * mult/2^shift = ns/cyc + * mult = ns/cyc * 2^shift + * mult = from/freq * 2^shift + * mult = from * 2^shift / freq + * mult = (from< Date: Mon, 22 Feb 2016 22:19:14 +0000 Subject: perf: Allow storage of PMU private data in event For PMUs which are not per CPU, but e.g. per package/socket, we want to be able to store a reference to the underlying per package/socket facility in the event at init time so we can avoid magic storage constructs in the PMU driver. This allows us to get rid of the per CPU dance in the intel uncore and RAPL drivers and avoids a lookup of the per package data in the perf hotpath. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.364140369@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f5c5a3fa2c81..a9d8cab18b00 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -468,6 +468,7 @@ struct perf_event { int group_flags; struct perf_event *group_leader; struct pmu *pmu; + void *pmu_private; enum perf_event_active_state state; unsigned int attach_state; -- cgit v1.2.3 From ff77e468535987b3d21b7bd4da15608ea3ce7d0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2016 15:27:07 +0100 Subject: sched/rt: Fix PI handling vs. sched_setscheduler() Andrea Parri reported: > I found that the following scenario (with CONFIG_RT_GROUP_SCHED=y) is not > handled correctly: > > T1 (prio = 20) > lock(rtmutex); > > T2 (prio = 20) > blocks on rtmutex (rt_nr_boosted = 0 on T1's rq) > > T1 (prio = 20) > sys_set_scheduler(prio = 0) > [new_effective_prio == oldprio] > T1 prio = 20 (rt_nr_boosted = 0 on T1's rq) > > The last step is incorrect as T1 is now boosted (c.f., rt_se_boosted()); > in particular, if we continue with > > T1 (prio = 20) > unlock(rtmutex) > wakeup(T2) > adjust_prio(T1) > [prio != rt_mutex_getprio(T1)] > dequeue(T1) > rt_nr_boosted = (unsigned long)(-1) > ... > T1 prio = 0 > > then we end up leaving rt_nr_boosted in an "inconsistent" state. > > The simple program attached could reproduce the previous scenario; note > that, as a consequence of the presence of this state, the "assertion" > > WARN_ON(!rt_nr_running && rt_nr_boosted) > > from dec_rt_group() may trigger. So normally we dequeue/enqueue tasks in sched_setscheduler(), which would ensure the accounting stays correct. However in the early PI path we fail to do so. So this was introduced at around v3.14, by: c365c292d059 ("sched: Consider pi boosting in setscheduler()") which fixed another problem exactly because that dequeue/enqueue, joy. Fix this by teaching rt about DEQUEUE_SAVE/ENQUEUE_RESTORE and have it preserve runqueue location with that option. This requires decoupling the on_rt_rq() state from being on the list. In order to allow for explicit movement during the SAVE/RESTORE, introduce {DE,EN}QUEUE_MOVE. We still must use SAVE/RESTORE in these cases to preserve other invariants. Respecting the SAVE/RESTORE flags also has the (nice) side-effect that things like sys_nice()/sys_sched_setaffinity() also do not reorder FIFO tasks (whereas they used to before this patch). Reported-by: Andrea Parri Tested-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index a292c4b7e94c..87e5f9886ac8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1293,6 +1293,8 @@ struct sched_rt_entity { unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; + unsigned short on_rq; + unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.2.3 From f904f58263e1df5f70feb8b283f4bbe662847334 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Feb 2016 14:54:56 +0100 Subject: sched/debug: Fix preempt_disable_ip recording for preempt_disable() The preempt_disable() invokes preempt_count_add() which saves the caller in ->preempt_disable_ip. It uses CALLER_ADDR1 which does not look for its caller but for the parent of the caller. Which means we get the correct caller for something like spin_lock() unless the architectures inlines those invocations. It is always wrong for preempt_disable() or local_bh_disable(). This patch makes the function get_lock_parent_ip() which tries CALLER_ADDR0,1,2 if the former is a locking function. This seems to record the preempt_disable() caller properly for preempt_disable() itself as well as for get_cpu_var() or local_bh_disable(). Steven asked for the get_parent_ip() -> get_lock_parent_ip() rename. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160226135456.GB18244@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 ++++++++++++ include/linux/sched.h | 2 -- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c2b340e23f62..6d9df3f7e334 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled) #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) +static inline unsigned long get_lock_parent_ip(void) +{ + unsigned long addr = CALLER_ADDR0; + + if (!in_lock_functions(addr)) + return addr; + addr = CALLER_ADDR1; + if (!in_lock_functions(addr)) + return addr; + return CALLER_ADDR2; +} + #ifdef CONFIG_IRQSOFF_TRACER extern void time_hardirqs_on(unsigned long a0, unsigned long a1); extern void time_hardirqs_off(unsigned long a0, unsigned long a1); diff --git a/include/linux/sched.h b/include/linux/sched.h index 87e5f9886ac8..9519b66fc21f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active); static inline void update_cpu_load_nohz(int active) { } #endif -extern unsigned long get_parent_ip(unsigned long addr); - extern void dump_cpu_task(int cpu); struct seq_file; -- cgit v1.2.3 From b82e530290a0437522720becaf4abdf8ca4cb7d2 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 19 Feb 2016 13:49:27 -0500 Subject: locking/qspinlock: Move __ARCH_SPIN_LOCK_UNLOCKED to qspinlock_types.h Move the __ARCH_SPIN_LOCK_UNLOCKED definition from qspinlock.h into qspinlock_types.h. The definition of __ARCH_SPIN_LOCK_UNLOCKED comes from the build arch's include files; but on x86 when CONFIG_QUEUED_SPINLOCKS=y, it just it's defined in asm-generic/qspinlock.h. In most cases, this doesn't matter because linux/spinlock.h includes asm/spinlock.h, which for x86 includes asm-generic/qspinlock.h. However, any code that only includes linux/mutex.h will break, because it only includes asm/spinlock_types.h. For example, this breaks systemtap, which only includes mutex.h. Signed-off-by: Dan Streetman Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Andrew Morton Cc: Arnd Bergmann Cc: Dan Streetman Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455907767-17821-1-git-send-email-dan.streetman@canonical.com Signed-off-by: Ingo Molnar --- include/asm-generic/qspinlock.h | 5 ----- include/asm-generic/qspinlock_types.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 39e1cb201b8e..35a52a880b2f 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -119,11 +119,6 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) } #endif -/* - * Initializier - */ -#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) } - /* * Remapping spinlock architecture specific functions to the corresponding * queued spinlock functions. diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 85f888e86761..034acd0c4956 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -32,6 +32,11 @@ typedef struct qspinlock { atomic_t val; } arch_spinlock_t; +/* + * Initializier + */ +#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) } + /* * Bitfields in the atomic value: * -- cgit v1.2.3 From a528aca7f359f4b0b1d72ae406097e491a5ba9ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 29 Feb 2016 12:12:46 -0500 Subject: use ->d_seq to get coherency between ->d_inode and ->d_flags Games with ordering and barriers are way too brittle. Just bump ->d_seq before and after updating ->d_inode and ->d_flags type bits, so that verifying ->d_seq would guarantee they are coherent. Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Al Viro --- include/linux/dcache.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7781ce110503..c4b5f4b3f8f8 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -409,9 +409,7 @@ static inline bool d_mountpoint(const struct dentry *dentry) */ static inline unsigned __d_entry_type(const struct dentry *dentry) { - unsigned type = READ_ONCE(dentry->d_flags); - smp_rmb(); - return type & DCACHE_ENTRY_TYPE; + return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) -- cgit v1.2.3 From 090e77c391dd983c8945b8e2e16d09f378d2e334 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:23 +0000 Subject: cpu/hotplug: Restructure FROZEN state handling There are only a few callbacks which really care about FROZEN vs. !FROZEN. No need to have extra states for this. Publish the frozen state in an extra variable which is updated under the hotplug lock and let the users interested deal with it w/o imposing that extra state checks on everyone. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.334912357@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d2ca8c38f9c4..f2fb54938ee6 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -118,6 +118,7 @@ enum { #ifdef CONFIG_SMP +extern bool cpuhp_tasks_frozen; /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) #define cpu_notifier(fn, pri) { \ @@ -177,6 +178,7 @@ extern void cpu_maps_update_done(void); #define cpu_notifier_register_done cpu_maps_update_done #else /* CONFIG_SMP */ +#define cpuhp_tasks_frozen 0 #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) #define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) -- cgit v1.2.3 From 5ba9ac8e2c45ab165e5b4a246f4821d319656e9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:27 +0000 Subject: cpu/hotplug: Add tracepoints We want to trace the hotplug machinery. Add tracepoints to track the invocation of callbacks and their result. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.593563875@linutronix.de Signed-off-by: Thomas Gleixner --- include/trace/events/cpuhp.h | 66 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 include/trace/events/cpuhp.h (limited to 'include') diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h new file mode 100644 index 000000000000..a72bd93ec7e5 --- /dev/null +++ b/include/trace/events/cpuhp.h @@ -0,0 +1,66 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpuhp + +#if !defined(_TRACE_CPUHP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUHP_H + +#include + +TRACE_EVENT(cpuhp_enter, + + TP_PROTO(unsigned int cpu, + int target, + int idx, + int (*fun)(unsigned int)), + + TP_ARGS(cpu, target, idx, fun), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, target ) + __field( int, idx ) + __field( void *, fun ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->target = target; + __entry->idx = idx; + __entry->fun = fun; + ), + + TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + __entry->cpu, __entry->target, __entry->idx, __entry->fun) +); + +TRACE_EVENT(cpuhp_exit, + + TP_PROTO(unsigned int cpu, + int state, + int idx, + int ret), + + TP_ARGS(cpu, state, idx, ret), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, state ) + __field( int, idx ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->state = state; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk(" cpu: %04u state: %3d step: %3d ret: %d", + __entry->cpu, __entry->state, __entry->idx, __entry->ret) +); + +#endif + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From cff7d378d3fdbb53db9b6e2578b14855f401cd41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:28 +0000 Subject: cpu/hotplug: Convert to a state machine for the control processor Move the split out steps into a callback array and let the cpu_up/down code iterate through the array functions. For now most of the callbacks are asymmetric to resemble the current hotplug maze. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.671816690@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 9 ++++----- include/linux/cpuhotplug.h | 13 +++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 include/linux/cpuhotplug.h (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f2fb54938ee6..78989f20420f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -16,6 +16,7 @@ #include #include #include +#include struct device; struct device_node; @@ -27,6 +28,9 @@ struct cpu { struct device dev; }; +extern void boot_cpu_init(void); +extern void boot_cpu_state_init(void); + extern int register_cpu(struct cpu *cpu, int num); extern struct device *get_cpu_device(unsigned cpu); extern bool cpu_is_hotpluggable(unsigned cpu); @@ -267,11 +271,6 @@ static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} #endif /* !CONFIG_PM_SLEEP_SMP */ -enum cpuhp_state { - CPUHP_OFFLINE, - CPUHP_ONLINE, -}; - void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h new file mode 100644 index 000000000000..d55c9e64acd7 --- /dev/null +++ b/include/linux/cpuhotplug.h @@ -0,0 +1,13 @@ +#ifndef __CPUHOTPLUG_H +#define __CPUHOTPLUG_H + +enum cpuhp_state { + CPUHP_OFFLINE, + CPUHP_CREATE_THREADS, + CPUHP_NOTIFY_PREPARE, + CPUHP_BRINGUP_CPU, + CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE, +}; + +#endif -- cgit v1.2.3 From 4baa0afc6719cbf36a1e08551484a641926b3fd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:29 +0000 Subject: cpu/hotplug: Convert the hotplugged cpu work to a state machine Move the functions which need to run on the hotplugged processor into a state machine array and let the code iterate through these functions. In a later state, this will grow synchronization points between the control processor and the hotplugged processor, so we can move the various architecture implementations of the synchronizations to the core. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.770651526@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d55c9e64acd7..d9303cca83d3 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,10 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_OFFLINE, + CPUHP_AP_NOTIFY_STARTING, + CPUHP_AP_ONLINE, + CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE, }; -- cgit v1.2.3 From 5b7aa87e0482be768486e0c2277aa4122487eb9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:33 +0000 Subject: cpu/hotplug: Implement setup/removal interface Implement function which allow to setup/remove hotplug state callbacks. The default behaviour for setup is to call the startup function for this state for (or on) all cpus which have a hotplug state >= the installed state. The default behaviour for removal is to call the teardown function for this state for (or on) all cpus which have a hotplug state >= the installed state. This includes rollback to the previous state in case of failure. A special state is CPUHP_ONLINE_DYN. Its for dynamically registering a hotplug callback pair. This is for drivers which have no dependencies to avoid that we need to allocate CPUHP states for each of them For both setup and remove helper functions are provided, which prevent the core to issue the callbacks. This simplifies the conversion of existing hotplug notifiers. [ Dynamic registering implemented by Sebastian Siewior ] Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.103464877@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d9303cca83d3..29935261b26d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,7 +11,74 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE_DYN, + CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, CPUHP_ONLINE, }; +int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)); + +/** + * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback (will be used in debug output) + * @startup: startup callback function + * @teardown: teardown callback function + * + * Installs the callback functions and invokes the startup callback on + * the present cpus which have already reached the @state. + */ +static inline int cpuhp_setup_state(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, true, startup, teardown); +} + +/** + * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the + * callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback. + * @startup: startup callback function + * @teardown: teardown callback function + * + * Same as @cpuhp_setup_state except that no calls are executed are invoked + * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n. + */ +static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, false, startup, teardown); +} + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); + +/** + * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown + * @state: The state for which the calls are removed + * + * Removes the callback functions and invokes the teardown callback on + * the present cpus which have already reached the @state. + */ +static inline void cpuhp_remove_state(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, true); +} + +/** + * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking + * teardown + * @state: The state for which the calls are removed + */ +static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, false); +} + #endif -- cgit v1.2.3 From 949338e35131c551f7bf54f48a2e3a227af6721b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:35 +0000 Subject: cpu/hotplug: Move scheduler cpu_online notifier to hotplug core Move the scheduler cpu online notifier part to the hotplug core. This is anyway the highest priority callback and we need that functionality right now for the next changes. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.200791046@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 29935261b26d..2f2e5d9711c4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,6 +10,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, + CPUHP_CPU_SET_ACTIVE, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, -- cgit v1.2.3 From 931ef163309ee955611f287dc65248b39a65fc9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:36 +0000 Subject: cpu/hotplug: Unpark smpboot threads from the state machine Handle the smpboot threads in the state machine. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.295777684@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 7 +------ include/linux/cpuhotplug.h | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 78989f20420f..83f35767016d 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,7 +78,7 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - CPU_PRI_SMPBOOT = 9, + /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, CPU_PRI_WORKQUEUE_DOWN = -5, @@ -172,7 +172,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb) } #endif -void smpboot_thread_init(void); int cpu_up(unsigned int cpu); void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); @@ -221,10 +220,6 @@ static inline void cpu_notifier_register_done(void) { } -static inline void smpboot_thread_init(void) -{ -} - #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2f2e5d9711c4..38679106fddd 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,6 +11,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, + CPUHP_SMPBOOT_THREADS, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, -- cgit v1.2.3 From 1cf4f629d9d246519a1e76c021806f2a51ddba4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:39 +0000 Subject: cpu/hotplug: Move online calls to hotplugged cpu Let the hotplugged cpu invoke the setup/teardown callbacks (CPU_ONLINE/CPU_DOWN_PREPARE) itself. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.536364371@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 38679106fddd..8a715bb1e192 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,10 +11,12 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, - CPUHP_SMPBOOT_THREADS, - CPUHP_NOTIFY_ONLINE, - CPUHP_ONLINE_DYN, - CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, + CPUHP_KICK_AP_THREAD, + CPUHP_BP_ONLINE, + CPUHP_AP_SMPBOOT_THREADS, + CPUHP_AP_NOTIFY_ONLINE, + CPUHP_AP_ONLINE_DYN, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_ONLINE, }; -- cgit v1.2.3 From fc6d73d67436e7784758a831227bd019547a3f73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:40 +0000 Subject: arch/hotplug: Call into idle with a proper state Let the non boot cpus call into idle with the corresponding hotplug state, so the hotplug core can handle the further bringup. That's a first step to convert the boot side of the hotplugged cpus to do all the synchronization with the other side through the state machine. For now it'll only start the hotplug thread and kick the full bringup of the cpu. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.614102639@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8a715bb1e192..4aa263adc536 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -13,6 +13,7 @@ enum cpuhp_state { CPUHP_CPU_SET_ACTIVE, CPUHP_KICK_AP_THREAD, CPUHP_BP_ONLINE, + CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, -- cgit v1.2.3 From 8df3e07e7f21f2ed8d001e6fabf9505946b438aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:41 +0000 Subject: cpu/hotplug: Let upcoming cpu bring itself fully up Let the upcoming cpu kick the hotplug thread and let itself complete the bringup. That way the controll side can just wait for the completion or later when we made the hotplug machinery async not care at all. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.697655464@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4aa263adc536..ad5d7fcb0130 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,9 +10,6 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, - CPUHP_CPU_SET_ACTIVE, - CPUHP_KICK_AP_THREAD, - CPUHP_BP_ONLINE, CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_NOTIFY_ONLINE, @@ -86,4 +83,10 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +#ifdef CONFIG_SMP +void cpuhp_online_idle(enum cpuhp_state state); +#else +static inline void cpuhp_online_idle(enum cpuhp_state state) { } +#endif + #endif -- cgit v1.2.3 From e69aab13117efc1987620090e539b4ebeb33a04c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:43 +0000 Subject: cpu/hotplug: Make wait for dead cpu completion based Kill the busy spinning on the control side and just wait for the hotplugged cpu to tell that it reached the dead state. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.776157858@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 5 +++-- include/linux/cpuhotplug.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 83f35767016d..91a48d1b4ca0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -276,14 +276,15 @@ void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); -DECLARE_PER_CPU(bool, cpu_dead_idle); - int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); +void cpuhp_report_idle_dead(void); +#else +static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_CPU_H_ */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index ad5d7fcb0130..5d68e15e46b7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,7 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, -- cgit v1.2.3 From 27d50c7eeb0f03c3d3ca72aac4d2dd487ca1f3f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:44 +0000 Subject: rcu: Make CPU_DYING_IDLE an explicit call Make the RCU CPU_DYING_IDLE callback an explicit function call, so it gets invoked at the proper place. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.870167933@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 4 +--- include/linux/notifier.h | 2 ++ include/linux/rcupdate.h | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 91a48d1b4ca0..f9b1fab4388a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -101,9 +101,7 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ -#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached - * idle loop. */ -#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, +#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d14a4c362465..4149868de4e6 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -47,6 +47,8 @@ * runtime initialization. */ +struct notifier_block; + typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47ee16f..fc46fe3ea259 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -332,9 +332,7 @@ void rcu_init(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); -struct notifier_block; -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +void rcu_report_dead(unsigned int cpu); #ifndef CONFIG_TINY_RCU void rcu_end_inkernel_boot(void); -- cgit v1.2.3 From d027d45d8a17a4145eab2d841140e9acbb7feb59 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 7 Jun 2015 15:54:30 +0200 Subject: nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 8 +++++ include/linux/tick.h | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..307e48375f21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -719,6 +719,10 @@ struct signal_struct { /* Earliest-expiration cache. */ struct task_cputime cputime_expires; +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif + struct list_head cpu_timers[3]; struct pid *tty_old_pgrp; @@ -1542,6 +1546,10 @@ struct task_struct { VTIME_SYS, } vtime_snap_whence; #endif + +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ u64 real_start_time; /* boot based time in nsec */ diff --git a/include/linux/tick.h b/include/linux/tick.h index 97fd4e543846..d60e5df8ec28 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -97,6 +97,18 @@ static inline void tick_broadcast_exit(void) tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT); } +enum tick_dep_bits { + TICK_DEP_BIT_POSIX_TIMER = 0, + TICK_DEP_BIT_PERF_EVENTS = 1, + TICK_DEP_BIT_SCHED = 2, + TICK_DEP_BIT_CLOCK_UNSTABLE = 3 +}; + +#define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) +#define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) +#define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) +#define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) + #ifdef CONFIG_NO_HZ_COMMON extern int tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); @@ -154,6 +166,72 @@ static inline int housekeeping_any_cpu(void) return cpumask_any_and(housekeeping_mask, cpu_online_mask); } +extern void tick_nohz_dep_set(enum tick_dep_bits bit); +extern void tick_nohz_dep_clear(enum tick_dep_bits bit); +extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit); + +/* + * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases + * on top of static keys. + */ +static inline void tick_dep_set(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set(bit); +} + +static inline void tick_dep_clear(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear(bit); +} + +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_set_cpu(cpu, bit); +} + +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_clear_cpu(cpu, bit); +} + +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_task(tsk, bit); +} +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_task(tsk, bit); +} +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_signal(signal, bit); +} +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_signal(signal, bit); +} + extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); @@ -166,6 +244,20 @@ static inline int housekeeping_any_cpu(void) static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } + +static inline void tick_dep_set(enum tick_dep_bits bit) { } +static inline void tick_dep_clear(enum tick_dep_bits bit) { } +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } + static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } -- cgit v1.2.3 From e6e6cc22e067a6f44449aa8fd0328404079c3ba5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Dec 2015 03:27:25 +0100 Subject: nohz: Use enum code for tick stop failure tracing message It makes nohz tracing more lightweight, standard and easier to parse. Examples: user_loop-2904 [007] d..1 517.701126: tick_stop: success=1 dependency=NONE user_loop-2904 [007] dn.1 518.021181: tick_stop: success=0 dependency=SCHED posix_timers-6142 [007] d..1 1739.027400: tick_stop: success=0 dependency=POSIX_TIMER user_loop-5463 [007] dN.1 1185.931939: tick_stop: success=0 dependency=PERF_EVENTS Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 1 + include/trace/events/timer.h | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index d60e5df8ec28..96d276a923bf 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -104,6 +104,7 @@ enum tick_dep_bits { TICK_DEP_BIT_CLOCK_UNSTABLE = 3 }; +#define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) #define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 073b9ac245ba..51440131d337 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -328,23 +328,49 @@ TRACE_EVENT(itimer_expire, ); #ifdef CONFIG_NO_HZ_COMMON + +#define TICK_DEP_NAMES \ + tick_dep_name(NONE) \ + tick_dep_name(POSIX_TIMER) \ + tick_dep_name(PERF_EVENTS) \ + tick_dep_name(SCHED) \ + tick_dep_name_end(CLOCK_UNSTABLE) + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); +#define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); + +TICK_DEP_NAMES + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, +#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } + +#define show_tick_dep_name(val) \ + __print_symbolic(val, TICK_DEP_NAMES) + TRACE_EVENT(tick_stop, - TP_PROTO(int success, char *error_msg), + TP_PROTO(int success, int dependency), - TP_ARGS(success, error_msg), + TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) - __string( msg, error_msg ) + __field( int , dependency ) ), TP_fast_assign( __entry->success = success; - __assign_str(msg, error_msg); + __entry->dependency = dependency; ), - TP_printk("success=%s msg=%s", __entry->success ? "yes" : "no", __get_str(msg)) + TP_printk("success=%d dependency=%s", __entry->success, \ + show_tick_dep_name(__entry->dependency)) ); #endif -- cgit v1.2.3 From 555e0c1ef7ff49ee5ac3a1eb12de4a2e4722f63d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2015 17:42:29 +0200 Subject: perf: Migrate perf to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify perf event tick dependency, migrate perf to the new mask. Perf needs the tick for two situations: 1) Freq events. We could set the tick dependency when those are installed on a CPU context. But setting a global dependency on top of the global freq events accounting is much easier. If people want that to be optimized, we can still refine that on the per-CPU tick dependency level. This patch dooesn't change the current behaviour anyway. 2) Throttled events: this is a per-cpu dependency. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/perf_event.h | 6 ------ include/linux/tick.h | 2 -- 2 files changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b35a61a481fa..d3ff88c13632 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1108,12 +1108,6 @@ static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } #endif -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL) -extern bool perf_event_can_stop_tick(void); -#else -static inline bool perf_event_can_stop_tick(void) { return true; } -#endif - #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else diff --git a/include/linux/tick.h b/include/linux/tick.h index 96d276a923bf..0e63db4bc662 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -233,7 +233,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, tick_nohz_dep_clear_signal(signal, bit); } -extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); @@ -260,7 +259,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif -- cgit v1.2.3 From 76d92ac305f23cada3a9b3c48a7ccea5f71019cb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: sched: Migrate sched to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify sched tick dependency, migrate sched to the new mask. Everytime a task is enqueued or dequeued, we evaluate the state of the tick dependency on top of the policy of the tasks in the runqueue, by order of priority: SCHED_DEADLINE: Need the tick in order to periodically check for runtime SCHED_FIFO : Don't need the tick (no round-robin) SCHED_RR : Need the tick if more than 1 task of the same priority for round robin (simplified with checking if more than one SCHED_RR task no matter what priority). SCHED_NORMAL : Need the tick if more than 1 task for round-robin. We could optimize that further with one flag per sched policy on the tick dependency mask and perform only the checks relevant to the policy concerned by an enqueue/dequeue operation. Since the checks aren't based on the current task anymore, we could get rid of the task switch hook but it's still needed for posix cpu timers. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 307e48375f21..2b10348806d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2364,10 +2364,7 @@ static inline void wake_up_nohz_cpu(int cpu) { } #endif #ifdef CONFIG_NO_HZ_FULL -extern bool sched_can_stop_tick(void); extern u64 scheduler_tick_max_deferment(void); -#else -static inline bool sched_can_stop_tick(void) { return false; } #endif #ifdef CONFIG_SCHED_AUTOGROUP -- cgit v1.2.3 From b78783000d5cb7c5994e6742e1d1ce594bfea15b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: posix-cpu-timers: Migrate to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify posix cpu timers tick dependency, migrate the latter to the new mask. In order to keep track of the running timers and expose the tick dependency accordingly, we must probe the timers queuing and dequeuing on threads and process lists. Unfortunately it implies both task and signal level dependencies. We should be able to further optimize this and merge all that on the task level dependency, at the cost of a bit of complexity and may be overhead. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/posix-timers.h | 3 --- include/linux/tick.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 907f3fd191ac..62d44c176071 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -128,9 +128,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer); void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk); - void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0e63db4bc662..21f73649a4dc 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -234,7 +234,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, } extern void tick_nohz_full_kick_cpu(int cpu); -extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); #else static inline int housekeeping_any_cpu(void) @@ -259,7 +258,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif -- cgit v1.2.3 From bdfc028de1b3cd59490d5413a5c87b0fa50040c2 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 29 Feb 2016 21:17:12 +0200 Subject: net/mlx5e: Fix ethtool RX hash func configuration change We should modify TIRs explicitly to apply the new RSS configuration. The light ndo close/open calls do not "refresh" them. Fixes: 2d75b2bc8a8c ('net/mlx5e: Add ethtool RSS configuration options') Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 51f1e540fc2b..58eef02edc7e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4245,7 +4245,9 @@ struct mlx5_ifc_modify_tir_bitmask_bits { u8 reserved_at_20[0x1b]; u8 self_lb_en[0x1]; - u8 reserved_at_3c[0x3]; + u8 reserved_at_3c[0x1]; + u8 hash[0x1]; + u8 reserved_at_3e[0x1]; u8 lro[0x1]; }; -- cgit v1.2.3 From 9da0f49c8767cc0ef6101cb21156cf4380ed50dd Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:20 -0800 Subject: time: Add timekeeping snapshot code capturing system time and counter In the current timekeeping code there isn't any interface to atomically capture the current relationship between the system counter and system time. ktime_get_snapshot() returns this triple (counter, monotonic raw, realtime) in the system_time_snapshot struct. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Moved structure definitions around to clean things up, fixed cycles_t/cycle_t confusion.] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..7817591af46f 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -266,6 +266,24 @@ extern void timekeeping_inject_sleeptime64(struct timespec64 *delta); extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real); +/* + * struct system_time_snapshot - simultaneous raw/real time capture with + * counter value + * @cycles: Clocksource counter value to produce the system times + * @real: Realtime system time + * @raw: Monotonic raw system time + */ +struct system_time_snapshot { + cycle_t cycles; + ktime_t real; + ktime_t raw; +}; + +/* + * Simultaneously snapshot realtime and monotonic raw clocks + */ +extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); + /* * Persistent clock related interfaces */ -- cgit v1.2.3 From ba26621e63ce6dc481d90ab9f6902e058d4ea39a Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:21 -0800 Subject: time: Remove duplicated code in ktime_get_raw_and_real() The code in ktime_get_snapshot() is a superset of the code in ktime_get_raw_and_real() code. Further, ktime_get_raw_and_real() is called only by the PPS code, pps_get_ts(). Consolidate the pps_get_ts() code into a single function calling ktime_get_snapshot() and eliminate ktime_get_raw_and_real(). A side effect of this is that the raw and real results of pps_get_ts() correspond to exactly the same clock cycle. Previously these values represented separate reads of the system clock. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall Signed-off-by: John Stultz --- include/linux/pps_kernel.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 54bf1484d41f..35ac903956c7 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } -#ifdef CONFIG_NTP_PPS - static inline void pps_get_ts(struct pps_event_time *ts) { - ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real); -} + struct system_time_snapshot snap; -#else /* CONFIG_NTP_PPS */ - -static inline void pps_get_ts(struct pps_event_time *ts) -{ - ktime_get_real_ts64(&ts->ts_real); + ktime_get_snapshot(&snap); + ts->ts_real = ktime_to_timespec64(snap.real); +#ifdef CONFIG_NTP_PPS + ts->ts_raw = ktime_to_timespec64(snap.raw); +#endif } -#endif /* CONFIG_NTP_PPS */ - /* Subtract known time delay from PPS event time(s) */ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta) { -- cgit v1.2.3 From 8006c24595cab106bcb9da12d35e32e14ff492df Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:22 -0800 Subject: time: Add driver cross timestamp interface for higher precision time synchronization ACKNOWLEDGMENT: cross timestamp code was developed by Thomas Gleixner . It has changed considerably and any mistakes are mine. The precision with which events on multiple networked systems can be synchronized using, as an example, PTP (IEEE 1588, 802.1AS) is limited by the precision of the cross timestamps between the system clock and the device (timestamp) clock. Precision here is the degree of simultaneity when capturing the cross timestamp. Currently the PTP cross timestamp is captured in software using the PTP device driver ioctl PTP_SYS_OFFSET. Reads of the device clock are interleaved with reads of the realtime clock. At best, the precision of this cross timestamp is on the order of several microseconds due to software latencies. Sub-microsecond precision is required for industrial control and some media applications. To achieve this level of precision hardware supported cross timestamping is needed. The function get_device_system_crosstimestamp() allows device drivers to return a cross timestamp with system time properly scaled to nanoseconds. The realtime value is needed to discipline that clock using PTP and the monotonic raw value is used for applications that don't require a "real" time, but need an unadjusted clock time. The get_device_system_crosstimestamp() code calls back into the driver to ensure that the system counter is within the current timekeeping update interval. Modern Intel hardware provides an Always Running Timer (ART) which is exactly related to TSC through a known frequency ratio. The ART is routed to devices on the system and is used to precisely and simultaneously capture the device clock with the ART. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Reworked to remove extra structures and simplify calling] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7817591af46f..4a2ca65fc778 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -279,6 +279,41 @@ struct system_time_snapshot { ktime_t raw; }; +/* + * struct system_device_crosststamp - system/device cross-timestamp + * (syncronized capture) + * @device: Device time + * @sys_realtime: Realtime simultaneous with device time + * @sys_monoraw: Monotonic raw simultaneous with device time + */ +struct system_device_crosststamp { + ktime_t device; + ktime_t sys_realtime; + ktime_t sys_monoraw; +}; + +/* + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. Used by + * timekeeping code to verify comparibility of two cycle values + */ +struct system_counterval_t { + cycle_t cycles; + struct clocksource *cs; +}; + +/* + * Get cross timestamp between system clock and device clock + */ +extern int get_device_system_crosststamp( + int (*get_time_fn)(ktime_t *device_time, + struct system_counterval_t *system_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp); + /* * Simultaneously snapshot realtime and monotonic raw clocks */ -- cgit v1.2.3 From 2c756feb18d9ec258dbb3a3d11c47e28820690d7 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:23 -0800 Subject: time: Add history to cross timestamp interface supporting slower devices Another representative use case of time sync and the correlated clocksource (in addition to PTP noted above) is PTP synchronized audio. In a streaming application, as an example, samples will be sent and/or received by multiple devices with a presentation time that is in terms of the PTP master clock. Synchronizing the audio output on these devices requires correlating the audio clock with the PTP master clock. The more precise this correlation is, the better the audio quality (i.e. out of sync audio sounds bad). From an application standpoint, to correlate the PTP master clock with the audio device clock, the system clock is used as a intermediate timebase. The transforms such an application would perform are: System Clock <-> Audio clock System Clock <-> Network Device Clock [<-> PTP Master Clock] Modern Intel platforms can perform a more accurate cross timestamp in hardware (ART,audio device clock). The audio driver requires ART->system time transforms -- the same as required for the network driver. These platforms offload audio processing (including cross-timestamps) to a DSP which to ensure uninterrupted audio processing, communicates and response to the host only once every millsecond. As a result is takes up to a millisecond for the DSP to receive a request, the request is processed by the DSP, the audio output hardware is polled for completion, the result is copied into shared memory, and the host is notified. All of these operation occur on a millisecond cadence. This transaction requires about 2 ms, but under heavier workloads it may take up to 4 ms. Adding a history allows these slow devices the option of providing an ART value outside of the current interval. In this case, the callback provided is an accessor function for the previously obtained counter value. If get_system_device_crosststamp() receives a counter value previous to cycle_last, it consults the history provided as an argument in history_ref and interpolates the realtime and monotonic raw system time using the provided counter value. If there are any clock discontinuities, e.g. from calling settimeofday(), the monotonic raw time is interpolated in the usual way, but the realtime clock time is adjusted by scaling the monotonic raw adjustment. When an accessor function is used a history argument *must* be provided. The history is initialized using ktime_get_snapshot() and must be called before the counter values are read. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Fixed up cycles_t/cycle_t type confusion] Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 2 ++ include/linux/timekeeping.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 25247220b4b7..e88005459035 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval @@ -91,6 +92,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; ktime_t next_leap_ktime; struct timespec64 raw_time; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 4a2ca65fc778..96f37bee3bc1 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -272,11 +272,15 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { cycle_t cycles; ktime_t real; ktime_t raw; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /* @@ -312,6 +316,7 @@ extern int get_device_system_crosststamp( struct system_counterval_t *system_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* -- cgit v1.2.3 From 58402b6e98f9059420e64b296db6c4cb6f6c1117 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 29 Feb 2016 09:02:47 +0100 Subject: [media] media.h: use hex values for range offsets, move connectors base up. Make the base offset hexadecimal to simplify debugging since the base addresses are hex too. The offsets for connectors is also changed to start after the 'reserved' range 0x10000-0x2ffff. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index f0328524be6e..13e19a18f97f 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -66,25 +66,25 @@ struct media_device_info { /* * DVB entities */ -#define MEDIA_ENT_F_DTV_DEMOD (MEDIA_ENT_F_BASE + 1) -#define MEDIA_ENT_F_TS_DEMUX (MEDIA_ENT_F_BASE + 2) -#define MEDIA_ENT_F_DTV_CA (MEDIA_ENT_F_BASE + 3) -#define MEDIA_ENT_F_DTV_NET_DECAP (MEDIA_ENT_F_BASE + 4) +#define MEDIA_ENT_F_DTV_DEMOD (MEDIA_ENT_F_BASE + 0x00001) +#define MEDIA_ENT_F_TS_DEMUX (MEDIA_ENT_F_BASE + 0x00002) +#define MEDIA_ENT_F_DTV_CA (MEDIA_ENT_F_BASE + 0x00003) +#define MEDIA_ENT_F_DTV_NET_DECAP (MEDIA_ENT_F_BASE + 0x00004) /* * I/O entities */ -#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 1001) -#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 1002) -#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 1003) +#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 0x01001) +#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 0x01002) +#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 0x01003) /* * Connectors */ /* It is a responsibility of the entity drivers to add connectors and links */ -#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 10001) -#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 10002) -#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 10003) +#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 0x30001) +#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 0x30002) +#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 0x30003) /* * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and -- cgit v1.2.3 From 82e88ff1ea948d83125a8aaa7c9809f03ccc500f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 11:11:12 +0100 Subject: hrtimer: Revert CLOCK_MONOTONIC_RAW support Revert commits: a6e707ddbdf1: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW 9006a01829a5: hrtimer: Catch illegal clockids 9c808765e88e: hrtimer: Add support for CLOCK_MONOTONIC_RAW Marc found out, that there are fundamental issues with that patch series because __hrtimer_get_next_event() and hrtimer_forward() need support for CLOCK_MONOTONIC_RAW. Nothing which is easily fixed, so revert the whole lot. Reported-by: Marc Zyngier Link: http://lkml.kernel.org/r/56D6CEF0.8060607@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a6d64af5e73f..76dd4f0da5ca 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,7 +151,6 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, - HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; -- cgit v1.2.3 From 93125094c07d8c9ec25dff5869f191b33eb9dd6e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 3 Mar 2016 14:52:51 -0300 Subject: [media] media.h: postpone connectors entities The representation of external connections got some heated discussions recently. As we're too close to the merge window, let's not set those entities into a stone. Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 13e19a18f97f..323f1af35062 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -82,10 +82,18 @@ struct media_device_info { * Connectors */ /* It is a responsibility of the entity drivers to add connectors and links */ +#ifdef __KERNEL__ + /* + * For now, it should not be used in userspace, as some + * definitions may change + */ + #define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 0x30001) #define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 0x30002) #define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 0x30003) +#endif + /* * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and * MEDIA_ENT_F_OLD_SUBDEV_BASE are kept to keep backward compatibility -- cgit v1.2.3 From 88f8b1bb41c6208f81b6a480244533ded7b59493 Mon Sep 17 00:00:00 2001 From: Gabriel Fernandez Date: Mon, 29 Feb 2016 17:18:22 +0100 Subject: stmmac: Fix 'eth0: No PHY found' regression This patch manages the case when you have an Ethernet MAC with a "fixed link", and not connected to a normal MDIO-managed PHY device. The test of phy_bus_name was not helpful because it was never affected and replaced by the mdio test node. Signed-off-by: Gabriel Fernandez Acked-by: Giuseppe Cavallaro Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index eead8ab93c0a..881a79d52467 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -100,6 +100,7 @@ struct plat_stmmacenet_data { int interface; struct stmmac_mdio_bus_data *mdio_bus_data; struct device_node *phy_node; + struct device_node *mdio_node; struct stmmac_dma_cfg *dma_cfg; int clk_csr; int has_gmac; -- cgit v1.2.3 From 1837b2e2bcd23137766555a63867e649c0b637f0 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 29 Feb 2016 15:03:33 -0800 Subject: mld, igmp: Fix reserved tailroom calculation The current reserved_tailroom calculation fails to take hlen and tlen into account. skb: [__hlen__|__data____________|__tlen___|__extra__] ^ ^ head skb_end_offset In this representation, hlen + data + tlen is the size passed to alloc_skb. "extra" is the extra space made available in __alloc_skb because of rounding up by kmalloc. We can reorder the representation like so: [__hlen__|__data____________|__extra__|__tlen___] ^ ^ head skb_end_offset The maximum space available for ip headers and payload without fragmentation is min(mtu, data + extra). Therefore, reserved_tailroom = data + extra + tlen - min(mtu, data + extra) = skb_end_offset - hlen - min(mtu, skb_end_offset - hlen - tlen) = skb_tailroom - min(mtu, skb_tailroom - tlen) ; after skb_reserve(hlen) Compare the second line to the current expression: reserved_tailroom = skb_end_offset - min(mtu, skb_end_offset) and we can see that hlen and tlen are not taken into account. The min() in the third line can be expanded into: if mtu < skb_tailroom - tlen: reserved_tailroom = skb_tailroom - mtu else: reserved_tailroom = tlen Depending on hlen, tlen, mtu and the number of multicast address records, the current code may output skbs that have less tailroom than dev->needed_tailroom or it may output more skbs than needed because not all space available is used. Fixes: 4c672e4b ("ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs") Signed-off-by: Benjamin Poirier Acked-by: Hannes Frederic Sowa Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/skbuff.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4ce9ff7086f4..d3fcd4591ce4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1985,6 +1985,30 @@ static inline void skb_reserve(struct sk_buff *skb, int len) skb->tail += len; } +/** + * skb_tailroom_reserve - adjust reserved_tailroom + * @skb: buffer to alter + * @mtu: maximum amount of headlen permitted + * @needed_tailroom: minimum amount of reserved_tailroom + * + * Set reserved_tailroom so that headlen can be as large as possible but + * not larger than mtu and tailroom cannot be smaller than + * needed_tailroom. + * The required headroom should already have been reserved before using + * this function. + */ +static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, + unsigned int needed_tailroom) +{ + SKB_LINEAR_ASSERT(skb); + if (mtu < skb_tailroom(skb) - needed_tailroom) + /* use at most mtu */ + skb->reserved_tailroom = skb_tailroom(skb) - mtu; + else + /* use up to all available space */ + skb->reserved_tailroom = needed_tailroom; +} + #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 -- cgit v1.2.3 From fbe093ac9f0201939279cdfe8b0fce20ce5ef7a9 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 3 Mar 2016 14:20:14 -0300 Subject: [media] media: Sanitise the reserved fields of the G_TOPOLOGY IOCTL arguments The argument structs are used in arrays for G_TOPOLOGY IOCTL. The arguments themselves do not need to be aligned to a power of two, but aligning them up to the largest basic type alignment (u64) on common ABIs is a good thing to do. The patch changes the size of the reserved fields to 5 or 6 u32's and aligns the size of the struct to 8 bytes so we do no longer depend on the compiler to perform the alignment. While at it, add __attribute__ ((packed)) to these structs as well. Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 323f1af35062..625b38f65764 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -297,14 +297,14 @@ struct media_v2_entity { __u32 id; char name[64]; /* FIXME: move to a property? (RFC says so) */ __u32 function; /* Main function of the entity */ - __u16 reserved[12]; -}; + __u32 reserved[6]; +} __attribute__ ((packed)); /* Should match the specific fields at media_intf_devnode */ struct media_v2_intf_devnode { __u32 major; __u32 minor; -}; +} __attribute__ ((packed)); struct media_v2_interface { __u32 id; @@ -316,22 +316,22 @@ struct media_v2_interface { struct media_v2_intf_devnode devnode; __u32 raw[16]; }; -}; +} __attribute__ ((packed)); struct media_v2_pad { __u32 id; __u32 entity_id; __u32 flags; - __u16 reserved[9]; -}; + __u32 reserved[5]; +} __attribute__ ((packed)); struct media_v2_link { __u32 id; __u32 source_id; __u32 sink_id; __u32 flags; - __u32 reserved[5]; -}; + __u32 reserved[6]; +} __attribute__ ((packed)); struct media_v2_topology { __u64 topology_version; @@ -351,7 +351,7 @@ struct media_v2_topology { __u32 num_links; __u32 reserved4; __u64 ptr_links; -}; +} __attribute__ ((packed)); static inline void __user *media_get_uptr(__u64 arg) { -- cgit v1.2.3 From 7bcd79ac50d9d83350a835bdb91c04ac9e098412 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Feb 2016 23:40:50 +0800 Subject: block: bio: introduce helpers to get the 1st and last bvec The bio passed to bio_will_gap() may be fast cloned from upper layer(dm, md, bcache, fs, ...), or from bio splitting in block core. Unfortunately bio_will_gap() just figures out the last bvec via 'bi_io_vec[prev->bi_vcnt - 1]' directly, and this way is obviously wrong. This patch introduces two helpers for getting the first and last bvec of one bio for fixing the issue. Cc: stable@vger.kernel.org Reported-by: Sagi Grimberg Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 5349e6816cbb..cb6888824108 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -310,6 +310,43 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit) bio->bi_flags &= ~(1U << bit); } +static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) +{ + *bv = bio_iovec(bio); +} + +static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) +{ + struct bvec_iter iter = bio->bi_iter; + int idx; + + if (!bio_flagged(bio, BIO_CLONED)) { + *bv = bio->bi_io_vec[bio->bi_vcnt - 1]; + return; + } + + if (unlikely(!bio_multiple_segments(bio))) { + *bv = bio_iovec(bio); + return; + } + + bio_advance_iter(bio, &iter, iter.bi_size); + + if (!iter.bi_bvec_done) + idx = iter.bi_idx - 1; + else /* in the middle of bvec */ + idx = iter.bi_idx; + + *bv = bio->bi_io_vec[idx]; + + /* + * iter.bi_bvec_done records actual length of the last bvec + * if this bio ends in the middle of one io vector + */ + if (iter.bi_bvec_done) + bv->bv_len = iter.bi_bvec_done; +} + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ -- cgit v1.2.3 From e0af29171aa8912e1ca95023b75ef336cd70d661 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Feb 2016 23:40:51 +0800 Subject: block: check virt boundary in bio_will_gap() In the following patch, the way for figuring out the last bvec will be changed with a bit cost introduced, so return immediately if the queue doesn't have virt boundary limit. Actually most of devices have not this limit. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4571ef1a12a9..cd06a41581b3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1372,6 +1372,13 @@ static inline void put_dev_sector(Sector p) page_cache_release(p.v); } +static inline bool __bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + return offset || + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); +} + /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. @@ -1381,18 +1388,17 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, { if (!queue_virt_boundary(q)) return false; - return offset || - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); + return __bvec_gap_to_prev(q, bprv, offset); } static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { - if (!bio_has_data(prev)) + if (!bio_has_data(prev) || !queue_virt_boundary(q)) return false; - return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], - next->bi_io_vec[0].bv_offset); + return __bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], + next->bi_io_vec[0].bv_offset); } static inline bool req_gap_back_merge(struct request *req, struct bio *bio) -- cgit v1.2.3 From 25e71a99f10e444cd00bb2ebccb11e1c9fb672b1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Feb 2016 23:40:52 +0800 Subject: block: get the 1st and last bvec via helpers This patch applies the two introduced helpers to figure out the 1st and last bvec, and fixes the original way after bio splitting. Cc: stable@vger.kernel.org Reported-by: Sagi Grimberg Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cd06a41581b3..d7f6bca707ef 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1394,11 +1394,16 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { - if (!bio_has_data(prev) || !queue_virt_boundary(q)) - return false; + if (bio_has_data(prev) && queue_virt_boundary(q)) { + struct bio_vec pb, nb; + + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); - return __bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], - next->bi_io_vec[0].bv_offset); + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); + } + + return false; } static inline bool req_gap_back_merge(struct request *req, struct bio *bio) -- cgit v1.2.3 From a1a0e23e49037c23ea84bc8cc146a03584d13577 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Feb 2016 18:28:53 -0500 Subject: writeback: flush inode cgroup wb switches instead of pinning super_block If cgroup writeback is in use, inodes can be scheduled for asynchronous wb switching. Before 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches"), this could race with umount leading to super_block being destroyed while inodes are pinned for wb switching. 5ff8eaac1636 fixed it by bumping s_active while wb switches are in flight; however, this allowed in-flight wb switches to make umounts asynchronous when the userland expected synchronosity - e.g. fsck immediately following umount may fail because the device is still busy. This patch removes the problematic super_block pinning and instead makes generic_shutdown_super() flush in-flight wb switches. wb switches are now executed on a dedicated isw_wq so that they can be flushed and isw_nr_in_flight keeps track of the number of in-flight wb switches so that flushing can be avoided in most cases. v2: Move cgroup_writeback_umount() further below and add MS_ACTIVE check in inode_switch_wbs() as Jan an Al suggested. Signed-off-by: Tejun Heo Reported-by: Tahsin Erdogan Cc: Jan Kara Cc: Al Viro Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A@mail.gmail.com Fixes: 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches") Cc: stable@vger.kernel.org #v4.5 Reviewed-by: Jan Kara Tested-by: Tahsin Erdogan Signed-off-by: Jens Axboe --- include/linux/writeback.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b333c945e571..d0b5ca5d4e08 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_io(struct writeback_control *wbc, struct page *page, size_t bytes); +void cgroup_writeback_umount(void); /** * inode_attach_wb - associate an inode with its wb @@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc, { } +static inline void cgroup_writeback_umount(void) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v1.2.3 From f21018427cb007a0894c36ad702990ab639cbbb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 14:43:45 -0700 Subject: block: fix blk_rq_get_max_sectors for driver private requests Driver private request types should not get the artifical cap for the FS requests. This is important to use the full device capabilities for internal command or NVMe pass through commands. Signed-off-by: Christoph Hellwig Reported-by: Jeff Lien Tested-by: Jeff Lien Reviewed-by: Keith Busch Updated by me to use an explicit check for the one command type that does support extended checking, instead of relying on the ordering of the enum command values - as suggested by Keith. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d7f6bca707ef..413c84fbc4ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -895,7 +895,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; - if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) + if (unlikely(rq->cmd_type != REQ_TYPE_FS)) return q->limits.max_hw_sectors; if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD)) -- cgit v1.2.3 From 719f1aa4a67199a3c4c68a03f94e5ec44d9d5f82 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:25 -0800 Subject: ptp: Add PTP_SYS_OFFSET_PRECISE for driver crosstimestamping Currently, network /system cross-timestamping is performed in the PTP_SYS_OFFSET ioctl. The PTP clock driver reads gettimeofday() and the gettime64() callback provided by the driver. The cross-timestamp is best effort where the latency between the capture of system time (getnstimeofday()) and the device time (driver callback) may be significant. The getcrosststamp() callback and corresponding PTP_SYS_OFFSET_PRECISE ioctl allows the driver to perform this device/system correlation when for example cross timestamp hardware is available. Modern Intel systems can do this for onboard Ethernet controllers using the ART counter. There is virtually zero latency between captures of the ART and network device clock. The capabilities ioctl (PTP_CLOCK_GETCAPS), is augmented allowing applications to query whether or not drivers implement the getcrosststamp callback, providing more precise cross timestamping. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Acked-by: Richard Cochran Signed-off-by: Christopher S. Hall [jstultz: Commit subject tweaks] Signed-off-by: John Stultz --- include/linux/ptp_clock_kernel.h | 8 ++++++++ include/uapi/linux/ptp_clock.h | 13 ++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index b8b73066d137..6b15e168148a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -38,6 +38,7 @@ struct ptp_clock_request { }; }; +struct system_device_crosststamp; /** * struct ptp_clock_info - decribes a PTP hardware clock * @@ -67,6 +68,11 @@ struct ptp_clock_request { * @gettime64: Reads the current time from the hardware clock. * parameter ts: Holds the result. * + * @getcrosststamp: Reads the current time from the hardware clock and + * system clock simultaneously. + * parameter cts: Contains timestamp (device,system) pair, + * where system time is realtime and monotonic. + * * @settime64: Set the current time on the hardware clock. * parameter ts: Time value to set. * @@ -105,6 +111,8 @@ struct ptp_clock_info { int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); + int (*getcrosststamp)(struct ptp_clock_info *ptp, + struct system_device_crosststamp *cts); int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts); int (*enable)(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on); diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index f0b7bfe5da92..ac6dded80ffa 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -51,7 +51,9 @@ struct ptp_clock_caps { int n_per_out; /* Number of programmable periodic signals. */ int pps; /* Whether the clock supports a PPS callback. */ int n_pins; /* Number of input/output pins. */ - int rsv[14]; /* Reserved for future use. */ + /* Whether the clock supports precise system-device cross timestamps */ + int cross_timestamping; + int rsv[13]; /* Reserved for future use. */ }; struct ptp_extts_request { @@ -81,6 +83,13 @@ struct ptp_sys_offset { struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1]; }; +struct ptp_sys_offset_precise { + struct ptp_clock_time device; + struct ptp_clock_time sys_realtime; + struct ptp_clock_time sys_monoraw; + unsigned int rsv[4]; /* Reserved for future use. */ +}; + enum ptp_pin_function { PTP_PF_NONE, PTP_PF_EXTTS, @@ -124,6 +133,8 @@ struct ptp_pin_desc { #define PTP_SYS_OFFSET _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset) #define PTP_PIN_GETFUNC _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc) #define PTP_PIN_SETFUNC _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc) +#define PTP_SYS_OFFSET_PRECISE \ + _IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occured. */ -- cgit v1.2.3 From e57cbaf0eb006eaa207395f3bfd7ce52c1b5539c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 3 Mar 2016 17:18:20 -0500 Subject: tracing: Do not have 'comm' filter override event 'comm' field Commit 9f61668073a8d "tracing: Allow triggers to filter for CPU ids and process names" added a 'comm' filter that will filter events based on the current tasks struct 'comm'. But this now hides the ability to filter events that have a 'comm' field too. For example, sched_migrate_task trace event. That has a 'comm' field of the task to be migrated. echo 'comm == "bash"' > events/sched_migrate_task/filter will now filter all sched_migrate_task events for tasks named "bash" that migrates other tasks (in interrupt context), instead of seeing when "bash" itself gets migrated. This fix requires a couple of changes. 1) Change the look up order for filter predicates to look at the events fields before looking at the generic filters. 2) Instead of basing the filter function off of the "comm" name, have the generic "comm" filter have its own filter_type (FILTER_COMM). Test against the type instead of the name to assign the filter function. 3) Add a new "COMM" filter that works just like "comm" but will filter based on the current task, even if the trace event contains a "comm" field. Do the same for "cpu" field, adding a FILTER_CPU and a filter "CPU". Cc: stable@vger.kernel.org # v4.3+ Fixes: 9f61668073a8d "tracing: Allow triggers to filter for CPU ids and process names" Reported-by: Matt Fleming Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 429fdfc3baf5..925730bc9fc1 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -568,6 +568,8 @@ enum { FILTER_DYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, + FILTER_COMM, + FILTER_CPU, }; extern int trace_event_raw_init(struct trace_event_call *call); -- cgit v1.2.3 From 5ea5c5e0a7f70b256417d3b6e36bd9851504babd Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 14 Feb 2016 18:06:41 +0800 Subject: ceph: initial CEPH_FEATURE_FS_FILE_LAYOUT_V2 support Add support for the format change of MClientReply/MclientCaps. Also add code that denies access to inodes with pool_ns layouts. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index c1ef6f14e7be..15151f3c4120 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -75,6 +75,7 @@ #define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ // duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 #define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ +#define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature -- cgit v1.2.3 From 72f9f3fdc928dc3ecd223e801b32d930b662b6ed Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Mon, 7 Mar 2016 12:27:04 +0100 Subject: sched/deadline: Remove dl_new from struct sched_dl_entity The dl_new field of struct sched_dl_entity is currently used to identify new deadline tasks, so that their deadline and runtime can be properly initialised. However, these tasks can be easily identified by checking if their deadline is smaller than the current time when they switch to SCHED_DEADLINE. So, dl_new can be removed by introducing this check in switched_to_dl(); this allows to simplify the SCHED_DEADLINE code. Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457350024-7825-2-git-send-email-luca.abeni@unitn.it Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9519b66fc21f..838a89a78332 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1333,10 +1333,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_new tells if a new instance arrived. If so we must - * start executing it with full runtime and reset its absolute - * deadline; - * * @dl_boosted tells if we are boosted due to DI. If so we are * outside bandwidth enforcement mechanism (but only until we * exit the critical section); @@ -1344,7 +1340,7 @@ struct sched_dl_entity { * @dl_yielded tells if task gave up the cpu before consuming * all its available runtime during the last job. */ - int dl_throttled, dl_new, dl_boosted, dl_yielded; + int dl_throttled, dl_boosted, dl_yielded; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From f6e45661f9be546811b62b2b01f32f4bf0c436c0 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 22 Jan 2016 18:34:22 -0800 Subject: dma, mm/pat: Rename dma_*_writecombine() to dma_*_wc() Rename dma_*_writecombine() to dma_*_wc(), so that the naming is coherent across the various write-combining APIs. Keep the old names for compatibility for a while, these can be removed at a later time. A guard is left to enable backporting of the rename, and later remove of the old mapping defines seemlessly. Build tested successfully with allmodconfig. The following Coccinelle SmPL patch was used for this simple transformation: @ rename_dma_alloc_writecombine @ expression dev, size, dma_addr, gfp; @@ -dma_alloc_writecombine(dev, size, dma_addr, gfp) +dma_alloc_wc(dev, size, dma_addr, gfp) @ rename_dma_free_writecombine @ expression dev, size, cpu_addr, dma_addr; @@ -dma_free_writecombine(dev, size, cpu_addr, dma_addr) +dma_free_wc(dev, size, cpu_addr, dma_addr) @ rename_dma_mmap_writecombine @ expression dev, vma, cpu_addr, dma_addr, size; @@ -dma_mmap_writecombine(dev, vma, cpu_addr, dma_addr, size) +dma_mmap_wc(dev, vma, cpu_addr, dma_addr, size) We also keep the old names as compatibility helpers, and guard against their definition to make backporting easier. Generated-by: Coccinelle SmPL Suggested-by: Ingo Molnar Signed-off-by: Luis R. Rodriguez Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bhelgaas@google.com Cc: bp@suse.de Cc: dan.j.williams@intel.com Cc: daniel.vetter@ffwll.ch Cc: dhowells@redhat.com Cc: julia.lawall@lip6.fr Cc: konrad.wilk@oracle.com Cc: linux-fbdev@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: luto@amacapital.net Cc: mst@redhat.com Cc: tomi.valkeinen@ti.com Cc: toshi.kani@hp.com Cc: vinod.koul@intel.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1453516462-4844-1-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar --- include/linux/dma-mapping.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 75857cda38e9..471e064af29f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -641,31 +641,40 @@ static inline void dmam_release_declared_memory(struct device *dev) } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ -static inline void *dma_alloc_writecombine(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t gfp) +static inline void *dma_alloc_wc(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t gfp) { DEFINE_DMA_ATTRS(attrs); dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_alloc_attrs(dev, size, dma_addr, gfp, &attrs); } +#ifndef dma_alloc_writecombine +#define dma_alloc_writecombine dma_alloc_wc +#endif -static inline void dma_free_writecombine(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr) +static inline void dma_free_wc(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr) { DEFINE_DMA_ATTRS(attrs); dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_free_attrs(dev, size, cpu_addr, dma_addr, &attrs); } +#ifndef dma_free_writecombine +#define dma_free_writecombine dma_free_wc +#endif -static inline int dma_mmap_writecombine(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, - size_t size) +static inline int dma_mmap_wc(struct device *dev, + struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, + size_t size) { DEFINE_DMA_ATTRS(attrs); dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs); } +#ifndef dma_mmap_writecombine +#define dma_mmap_writecombine dma_mmap_wc +#endif #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME -- cgit v1.2.3 From dc17147de328a74bbdee67c1bf37d2f1992de756 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 9 Mar 2016 11:58:41 -0500 Subject: tracing: Fix check for cpu online when event is disabled Commit f37755490fe9b ("tracepoints: Do not trace when cpu is offline") added a check to make sure that tracepoints only get called when the cpu is online, as it uses rcu_read_lock_sched() for protection. Commit 3a630178fd5f3 ("tracing: generate RCU warnings even when tracepoints are disabled") added lockdep checks (including rcu checks) for events that are not enabled to catch possible RCU issues that would only be triggered if a trace event was enabled. Commit f37755490fe9b only stopped the warnings when the trace event was enabled but did not prevent warnings if the trace event was called when disabled. To fix this, the cpu online check is moved to where the condition is added to the trace event. This will place the cpu online check in all places that it may be used now and in the future. Cc: stable@vger.kernel.org # v3.18+ Fixes: f37755490fe9b ("tracepoints: Do not trace when cpu is offline") Fixes: 3a630178fd5f3 ("tracing: generate RCU warnings even when tracepoints are disabled") Reported-by: Sudeep Holla Tested-by: Sudeep Holla Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index acfdbf353a0b..be586c632a0c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -134,9 +134,6 @@ extern void syscall_unregfunc(void); void *it_func; \ void *__data; \ \ - if (!cpu_online(raw_smp_processor_id())) \ - return; \ - \ if (!(cond)) \ return; \ prercu; \ @@ -343,15 +340,19 @@ extern void syscall_unregfunc(void); * "void *__data, proto" as the callback prototype. */ #define DECLARE_TRACE_NOARGS(name) \ - __DECLARE_TRACE(name, void, , 1, void *__data, __data) + __DECLARE_TRACE(name, void, , \ + cpu_online(raw_smp_processor_id()), \ + void *__data, __data) #define DECLARE_TRACE(name, proto, args) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1, \ - PARAMS(void *__data, proto), \ - PARAMS(__data, args)) + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto), \ + PARAMS(__data, args)) #define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto), \ PARAMS(__data, args)) -- cgit v1.2.3 From d77a117e6871ff78a06def46583d23752593de60 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2016 14:08:10 -0800 Subject: list: kill list_force_poison() Given we have uninitialized list_heads being passed to list_add() it will always be the case that those uninitialized values randomly trigger the poison value. Especially since a list_add() operation will seed the stack with the poison value for later stack allocations to trip over. For example, see these two false positive reports: list_add attempted on force-poisoned entry WARNING: at lib/list_debug.c:34 [..] NIP [c00000000043c390] __list_add+0xb0/0x150 LR [c00000000043c38c] __list_add+0xac/0x150 Call Trace: __list_add+0xac/0x150 (unreliable) __down+0x4c/0xf8 down+0x68/0x70 xfs_buf_lock+0x4c/0x150 [xfs] list_add attempted on force-poisoned entry(0000000000000500), new->next == d0000000059ecdb0, new->prev == 0000000000000500 WARNING: at lib/list_debug.c:33 [..] NIP [c00000000042db78] __list_add+0xa8/0x140 LR [c00000000042db74] __list_add+0xa4/0x140 Call Trace: __list_add+0xa4/0x140 (unreliable) rwsem_down_read_failed+0x6c/0x1a0 down_read+0x58/0x60 xfs_log_commit_cil+0x7c/0x600 [xfs] Fixes: commit 5c2c2587b132 ("mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup") Signed-off-by: Dan Williams Reported-by: Eryu Guan Tested-by: Eryu Guan Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/list.h b/include/linux/list.h index 30cf4200ab40..5356f4d661a7 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -113,17 +113,6 @@ extern void __list_del_entry(struct list_head *entry); extern void list_del(struct list_head *entry); #endif -#ifdef CONFIG_DEBUG_LIST -/* - * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one - * of the pages it allocates is ever passed to list_add() - */ -extern void list_force_poison(struct list_head *entry); -#else -/* fallback to the less strict LIST_POISON* definitions */ -#define list_force_poison list_del -#endif - /** * list_replace - replace old entry by new one * @old : the element to be replaced -- cgit v1.2.3 From e3ae116339f9a0c77523abc95e338fa405946e07 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:15 -0800 Subject: kasan: add functions to clear stack poison Functions which the compiler has instrumented for ASAN place poison on the stack shadow upon entry and remove this poison prior to returning. In some cases (e.g. hotplug and idle), CPUs may exit the kernel a number of levels deep in C code. If there are any instrumented functions on this critical path, these will leave portions of the idle thread stack shadow poisoned. If a CPU returns to the kernel via a different path (e.g. a cold entry), then depending on stack frame layout subsequent calls to instrumented functions may use regions of the stack with stale poison, resulting in (spurious) KASAN splats to the console. Contemporary GCCs always add stack shadow poisoning when ASAN is enabled, even when asked to not instrument a function [1], so we can't simply annotate functions on the critical path to avoid poisoning. Instead, this series explicitly removes any stale poison before it can be hit. In the common hotplug case we clear the entire stack shadow in common code, before a CPU is brought online. On architectures which perform a cold return as part of cpu idle may retain an architecture-specific amount of stack contents. To retain the poison for this retained context, the arch code must call the core KASAN code, passing a "watermark" stack pointer value beyond which shadow will be cleared. Architectures which don't perform a cold return as part of idle do not need any additional code. This patch (of 3): Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poision prior to returning. In some cases (e.g. hotplug and idle), CPUs may exit the kernel a number of levels deep in C code. If there are any instrumented functions on this critical path, these will leave portions of the stack shadow poisoned. If a CPU returns to the kernel via a different path (e.g. a cold entry), then depending on stack frame layout subsequent calls to instrumented functions may use regions of the stack with stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, we must clear stale poison from the stack prior to instrumented functions being called. This patch adds functions to the KASAN core for removing poison from (portions of) a task's stack. These will be used by subsequent patches to avoid problems with hotplug and idle. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Lorenzo Pieralisi Cc: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4b9f85c963d0..0fdc798e3ff7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -1,6 +1,7 @@ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H +#include #include struct kmem_cache; @@ -13,7 +14,6 @@ struct vm_struct; #include #include -#include extern unsigned char kasan_zero_page[PAGE_SIZE]; extern pte_t kasan_zero_pte[PTRS_PER_PTE]; @@ -43,6 +43,8 @@ static inline void kasan_disable_current(void) void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_unpoison_task_stack(struct task_struct *task); + void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); @@ -66,6 +68,8 @@ void kasan_free_shadow(const struct vm_struct *vm); static inline void kasan_unpoison_shadow(const void *address, size_t size) {} +static inline void kasan_unpoison_task_stack(struct task_struct *task) {} + static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} -- cgit v1.2.3 From d6b7eaeb03421139e32800324ef04ab50bba886d Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 9 Mar 2016 14:08:38 -0800 Subject: dma-mapping: avoid oops when parameter cpu_addr is null To keep consistent with kfree, which tolerate ptr is NULL. We do this because sometimes we may use goto statement, so that success and failure case can share parts of the code. But unfortunately, dma_free_coherent called with parameter cpu_addr is null will cause oops, such as showed below: Unable to handle kernel paging request at virtual address ffffffc020d3b2b8 pgd = ffffffc083a61000 [ffffffc020d3b2b8] *pgd=0000000000000000, *pud=0000000000000000 CPU: 4 PID: 1489 Comm: malloc_dma_1 Tainted: G O 4.1.12 #1 Hardware name: ARM64 (DT) PC is at __dma_free_coherent.isra.10+0x74/0xc8 LR is at __dma_free+0x9c/0xb0 Process malloc_dma_1 (pid: 1489, stack limit = 0xffffffc0837fc020) [...] Call trace: __dma_free_coherent.isra.10+0x74/0xc8 __dma_free+0x9c/0xb0 malloc_dma+0x104/0x158 [dma_alloc_coherent_mtmalloc] kthread+0xec/0xfc Signed-off-by: Zhen Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 75857cda38e9..728ef074602a 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -386,7 +386,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size, if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) return; - if (!ops->free) + if (!ops->free || !cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); -- cgit v1.2.3 From b2cd27448b33de9069d580d8f229efef434b64e6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 5 Mar 2016 07:13:39 -0300 Subject: [media] media-device: map new functions into old types for legacy API The legacy media controller userspace API exposes entity types that carry both type and function information. The new API replaces the type with a function. It preserves backward compatibility by defining legacy functions for the existing types and using them in drivers. This works fine, as long as newer entity functions won't be added. Unfortunately, some tools, like media-ctl with --print-dot argument rely on the now legacy MEDIA_ENT_T_V4L2_SUBDEV and MEDIA_ENT_T_DEVNODE numeric ranges to identify what entities will be shown. Also, if the entity doesn't match those ranges, it will ignore the major/minor information on devnodes, and won't be getting the devnode name via udev or sysfs. As we're now adding devices outside the old range, the legacy ioctl needs to map the new entity functions into a type at the old range, or otherwise we'll have a regression. Detected on all released media-ctl versions (e. g. versions <= 1.10). Fix this by deriving the type from the function to emulate the legacy API if the function isn't in the legacy functions range. Reported-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 625b38f65764..a8e3a8c0d85a 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -120,7 +120,7 @@ struct media_device_info { #define MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN MEDIA_ENT_F_OLD_SUBDEV_BASE -#ifndef __KERNEL__ +#if !defined(__KERNEL__) || defined(__NEED_MEDIA_LEGACY_API) /* * Legacy symbols used to avoid userspace compilation breakages @@ -133,6 +133,10 @@ struct media_device_info { #define MEDIA_ENT_TYPE_MASK 0x00ff0000 #define MEDIA_ENT_SUBTYPE_MASK 0x0000ffff +/* End of the old subdev reserved numberspace */ +#define MEDIA_ENT_T_DEVNODE_UNKNOWN (MEDIA_ENT_T_DEVNODE | \ + MEDIA_ENT_SUBTYPE_MASK) + #define MEDIA_ENT_T_DEVNODE MEDIA_ENT_F_OLD_BASE #define MEDIA_ENT_T_DEVNODE_V4L MEDIA_ENT_F_IO_V4L #define MEDIA_ENT_T_DEVNODE_FB (MEDIA_ENT_T_DEVNODE + 2) -- cgit v1.2.3 From 90d0f0f11588ec692c12f9009089b398be395184 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 12 Mar 2016 22:56:19 +0800 Subject: block: don't optimize for non-cloned bio in bio_get_last_bvec() For !BIO_CLONED bio, we can use .bi_vcnt safely, but it doesn't mean we can just simply return .bi_io_vec[.bi_vcnt - 1] because the start postion may have been moved in the middle of the bvec, such as splitting in the middle of bvec. Fixes: 7bcd79ac50d9(block: bio: introduce helpers to get the 1st and last bvec) Cc: stable@vger.kernel.org Reported-by: Kent Overstreet Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index cb6888824108..88bc64f00bb5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -320,11 +320,6 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) struct bvec_iter iter = bio->bi_iter; int idx; - if (!bio_flagged(bio, BIO_CLONED)) { - *bv = bio->bi_io_vec[bio->bi_vcnt - 1]; - return; - } - if (unlikely(!bio_multiple_segments(bio))) { *bv = bio_iovec(bio); return; -- cgit v1.2.3