From 0fc09f920983f61be625658c62cc40ac25a7b3a5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jul 2018 08:37:50 -0600 Subject: blk-mq: export setting request completion state This is preparing for drivers that want to directly alter the state of their requests. No functional change here. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e3147eb74222..ca3f2c2edd85 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -287,6 +287,20 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); +/** + * blk_mq_mark_complete() - Set request state to complete + * @rq: request to set to complete state + * + * Returns true if request state was successfully set to complete. If + * successful, the caller is responsibile for seeing this request is ended, as + * blk_mq_complete_request will not work again. + */ +static inline bool blk_mq_mark_complete(struct request *rq) +{ + return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) == + MQ_RQ_IN_FLIGHT; +} + /* * Driver command data is immediately after the request. So subtract request * size to get back to the original request, add request size to get the PDU. -- cgit v1.2.3 From 62cedf3e60af03e47849fe2bd6a03ec179422a8a Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Fri, 20 Jul 2018 10:39:13 +0200 Subject: locking/rtmutex: Allow specifying a subclass for nested locking Needed for annotating rt_mutex locks. Tested-by: John Sperbeck Signed-off-by: Peter Rosin Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Deepa Dinamani Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Chang Cc: Peter Zijlstra Cc: Philippe Ombredanne Cc: Thomas Gleixner Cc: Will Deacon Cc: Wolfram Sang Link: http://lkml.kernel.org/r/20180720083914.1950-2-peda@axentia.se Signed-off-by: Ingo Molnar --- include/linux/rtmutex.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 1b92a28dd672..6fd615a0eea9 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -106,7 +106,14 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock) extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void rt_mutex_destroy(struct rt_mutex *lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); +#define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) +#else extern void rt_mutex_lock(struct rt_mutex *lock); +#define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) +#endif + extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout); -- cgit v1.2.3 From 6cbc304f2f360f25cc8607817239d6f4a2fd3dc5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 May 2018 15:48:41 +0200 Subject: perf/x86/intel: Fix unwind errors from PEBS entries (mk-II) Vince reported the perf_fuzzer giving various unwinder warnings and Josh reported: > Deja vu. Most of these are related to perf PEBS, similar to the > following issue: > > b8000586c90b ("perf/x86/intel: Cure bogus unwind from PEBS entries") > > This is basically the ORC version of that. setup_pebs_sample_data() is > assembling a franken-pt_regs which ORC isn't happy about. RIP is > inconsistent with some of the other registers (like RSP and RBP). And where the previous unwinder only needed BP,SP ORC also requires IP. But we cannot spoof IP because then the sample will get displaced, entirely negating the point of PEBS. So cure the whole thing differently by doing the unwind early; this does however require a means to communicate we did the unwind early. We (ab)use an unused sample_type bit for this, which we set on events that fill out the data->callchain before the normal perf_prepare_sample(). Debugged-by: Josh Poimboeuf Reported-by: Vince Weaver Tested-by: Josh Poimboeuf Tested-by: Prashant Bhole Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1fa12887ec02..87f6db437e4a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1130,6 +1130,7 @@ extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); +extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); -- cgit v1.2.3 From 73c8d8945505acdcbae137c2e00a1232e0be709f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 14 Jul 2018 01:28:15 +0900 Subject: ring_buffer: tracing: Inherit the tracing setting to next ring buffer Maintain the tracing on/off setting of the ring_buffer when switching to the trace buffer snapshot. Taking a snapshot is done by swapping the backup ring buffer (max_tr_buffer). But since the tracing on/off setting is defined by the ring buffer, when swapping it, the tracing on/off setting can also be changed. This causes a strange result like below: /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 0 > tracing_on /sys/kernel/debug/tracing # cat tracing_on 0 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 0 We don't touch tracing_on, but snapshot changes tracing_on setting each time. This is an anomaly, because user doesn't know that each "ring_buffer" stores its own tracing-enable state and the snapshot is done by swapping ring buffers. Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox Cc: Ingo Molnar Cc: Shuah Khan Cc: Tom Zanussi Cc: Hiraku Toyooka Cc: stable@vger.kernel.org Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Masami Hiramatsu [ Updated commit log and comment in the code ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b72ebdff0b77..003d09ab308d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -165,6 +165,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); int ring_buffer_record_is_on(struct ring_buffer *buffer); +int ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); -- cgit v1.2.3 From b512719f771a82180211c9a315b8a7f628832b3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Jul 2018 16:37:08 -0700 Subject: delayacct: fix crash in delayacct_blkio_end() after delayacct init failure While forking, if delayacct init fails due to memory shortage, it continues expecting all delayacct users to check task->delays pointer against NULL before dereferencing it, which all of them used to do. Commit c96f5471ce7d ("delayacct: Account blkio completion on the correct task"), while updating delayacct_blkio_end() to take the target task instead of always using %current, made the function test NULL on %current->delays and then continue to operated on @p->delays. If %current succeeded init while @p didn't, it leads to the following crash. BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 IP: __delayacct_blkio_end+0xc/0x40 PGD 8000001fd07e1067 P4D 8000001fd07e1067 PUD 1fcffbb067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 4 PID: 25774 Comm: QIOThread0 Not tainted 4.16.0-9_fbk1_rc2_1180_g6b593215b4d7 #9 RIP: 0010:__delayacct_blkio_end+0xc/0x40 Call Trace: try_to_wake_up+0x2c0/0x600 autoremove_wake_function+0xe/0x30 __wake_up_common+0x74/0x120 wake_up_page_bit+0x9c/0xe0 mpage_end_io+0x27/0x70 blk_update_request+0x78/0x2c0 scsi_end_request+0x2c/0x1e0 scsi_io_completion+0x20b/0x5f0 blk_mq_complete_request+0xa2/0x100 ata_scsi_qc_complete+0x79/0x400 ata_qc_complete_multiple+0x86/0xd0 ahci_handle_port_interrupt+0xc9/0x5c0 ahci_handle_port_intr+0x54/0xb0 ahci_single_level_irq_intr+0x3b/0x60 __handle_irq_event_percpu+0x43/0x190 handle_irq_event_percpu+0x20/0x50 handle_irq_event+0x2a/0x50 handle_edge_irq+0x80/0x1c0 handle_irq+0xaf/0x120 do_IRQ+0x41/0xc0 common_interrupt+0xf/0xf Fix it by updating delayacct_blkio_end() check @p->delays instead. Link: http://lkml.kernel.org/r/20180724175542.GP1934745@devbig577.frc2.facebook.com Fixes: c96f5471ce7d ("delayacct: Account blkio completion on the correct task") Signed-off-by: Tejun Heo Reported-by: Dave Jones Debugged-by: Dave Jones Reviewed-by: Andrew Morton Cc: Josh Snyder Cc: [4.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index e6c0448ebcc7..31c865d1842e 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -124,7 +124,7 @@ static inline void delayacct_blkio_start(void) static inline void delayacct_blkio_end(struct task_struct *p) { - if (current->delays) + if (p->delays) __delayacct_blkio_end(p); delayacct_clear_flag(DELAYACCT_PF_BLKIO); } -- cgit v1.2.3 From 027232da7c7c1c7f04383f93bd798e475dde5285 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 26 Jul 2018 16:37:25 -0700 Subject: mm: introduce vma_init() Not all VMAs allocated with vm_area_alloc(). Some of them allocated on stack or in data segment. The new helper can be use to initialize VMA properly regardless where it was allocated. Link: http://lkml.kernel.org/r/20180724121139.62570-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Reviewed-by: Andrew Morton Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d3a3842316b8..31540f166987 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -452,6 +452,12 @@ struct vm_operations_struct { unsigned long addr); }; +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ + vma->vm_mm = mm; + INIT_LIST_HEAD(&vma->anon_vma_chain); +} + struct mmu_gather; struct inode; -- cgit v1.2.3 From bfd40eaff5abb9f62c8ef94ca13ed0d94a560f10 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 26 Jul 2018 16:37:35 -0700 Subject: mm: fix vma_is_anonymous() false-positives vma_is_anonymous() relies on ->vm_ops being NULL to detect anonymous VMA. This is unreliable as ->mmap may not set ->vm_ops. False-positive vma_is_anonymous() may lead to crashes: next ffff8801ce5e7040 prev ffff8801d20eca50 mm ffff88019c1e13c0 prot 27 anon_vma ffff88019680cdd8 vm_ops 0000000000000000 pgoff 0 file ffff8801b2ec2d00 private_data 0000000000000000 flags: 0xff(read|write|exec|shared|mayread|maywrite|mayexec|mayshare) ------------[ cut here ]------------ kernel BUG at mm/memory.c:1422! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 18486 Comm: syz-executor3 Not tainted 4.18.0-rc3+ #136 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:zap_pmd_range mm/memory.c:1421 [inline] RIP: 0010:zap_pud_range mm/memory.c:1466 [inline] RIP: 0010:zap_p4d_range mm/memory.c:1487 [inline] RIP: 0010:unmap_page_range+0x1c18/0x2220 mm/memory.c:1508 Call Trace: unmap_single_vma+0x1a0/0x310 mm/memory.c:1553 zap_page_range_single+0x3cc/0x580 mm/memory.c:1644 unmap_mapping_range_vma mm/memory.c:2792 [inline] unmap_mapping_range_tree mm/memory.c:2813 [inline] unmap_mapping_pages+0x3a7/0x5b0 mm/memory.c:2845 unmap_mapping_range+0x48/0x60 mm/memory.c:2880 truncate_pagecache+0x54/0x90 mm/truncate.c:800 truncate_setsize+0x70/0xb0 mm/truncate.c:826 simple_setattr+0xe9/0x110 fs/libfs.c:409 notify_change+0xf13/0x10f0 fs/attr.c:335 do_truncate+0x1ac/0x2b0 fs/open.c:63 do_sys_ftruncate+0x492/0x560 fs/open.c:205 __do_sys_ftruncate fs/open.c:215 [inline] __se_sys_ftruncate fs/open.c:213 [inline] __x64_sys_ftruncate+0x59/0x80 fs/open.c:213 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Reproducer: #include #include #include #include #include #include #include #include #include #include #include #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) #define KCOV_ENABLE _IO('c', 100) #define KCOV_DISABLE _IO('c', 101) #define COVER_SIZE (1024<<10) #define KCOV_TRACE_PC 0 #define KCOV_TRACE_CMP 1 int main(int argc, char **argv) { int fd; unsigned long *cover; system("mount -t debugfs none /sys/kernel/debug"); fd = open("/sys/kernel/debug/kcov", O_RDWR); ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE); cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); munmap(cover, COVER_SIZE * sizeof(unsigned long)); cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); memset(cover, 0, COVER_SIZE * sizeof(unsigned long)); ftruncate(fd, 3UL << 20); return 0; } This can be fixed by assigning anonymous VMAs own vm_ops and not relying on it being NULL. If ->mmap() failed to set ->vm_ops, mmap_region() will set it to dummy_vm_ops. This way we will have non-NULL ->vm_ops for all VMAs. Link: http://lkml.kernel.org/r/20180724121139.62570-4-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: syzbot+3f84280d52be9b7083cc@syzkaller.appspotmail.com Acked-by: Linus Torvalds Reviewed-by: Andrew Morton Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 31540f166987..7ba6d356d18f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -454,10 +454,18 @@ struct vm_operations_struct { static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { + static const struct vm_operations_struct dummy_vm_ops = {}; + vma->vm_mm = mm; + vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); } +static inline void vma_set_anonymous(struct vm_area_struct *vma) +{ + vma->vm_ops = NULL; +} + struct mmu_gather; struct inode; -- cgit v1.2.3 From fa3fc2ad99b4f025446d1cff589a8d2dd7db92f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Jul 2018 16:37:38 -0700 Subject: include/linux/eventfd.h: include linux/errno.h The new gasket staging driver ran into a randconfig build failure when CONFIG_EVENTFD is disabled: In file included from drivers/staging/gasket/gasket_interrupt.h:11, from drivers/staging/gasket/gasket_interrupt.c:4: include/linux/eventfd.h: In function 'eventfd_ctx_fdget': include/linux/eventfd.h:51:9: error: implicit declaration of function 'ERR_PTR' [-Werror=implicit-function-declaration] I can't see anything wrong with including eventfd.h before err.h, so the easiest fix is to make it possible to do this by including the file where it is needed. Link: http://lkml.kernel.org/r/20180724110737.3985088-1-arnd@arndb.de Fixes: 9a69f5087ccc ("drivers/staging: Gasket driver framework + Apex driver") Signed-off-by: Arnd Bergmann Cc: Eric Biggers Cc: Al Viro Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/eventfd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 7094718b653b..ffcc7724ca21 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -11,6 +11,7 @@ #include #include +#include /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining -- cgit v1.2.3 From 8b11ec1b5ffb54f71cb5a5e5c8c4d36e5d113085 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 Aug 2018 13:43:38 -0700 Subject: mm: do not initialize TLB stack vma's with vma_init() Commit 2c4541e24c55 ("mm: use vma_init() to initialize VMAs on stack and data segments") tried to initialize various left-over ad-hoc vma's "properly", but actually made things worse for the temporary vma's used for TLB flushing. vma_init() doesn't actually initialize all of the vma, just a few fields, so doing something like - struct vm_area_struct vma = { .vm_mm = tlb->mm, }; + struct vm_area_struct vma; + + vma_init(&vma, tlb->mm); was actually very bad: instead of having a nicely initialized vma with every field but "vm_mm" zeroed, you'd have an entirely uninitialized vma with only a couple of fields initialized. And they weren't even fields that the code in question mostly cared about. The flush_tlb_range() function takes a "struct vma" rather than a "struct mm_struct", because a few architectures actually care about what kind of range it is - being able to only do an ITLB flush if it's a range that doesn't have data accesses enabled, for example. And all the normal users already have the vma for doing the range invalidation. But a few people want to call flush_tlb_range() with a range they just made up, so they also end up using a made-up vma. x86 just has a special "flush_tlb_mm_range()" function for this, but other architectures (arm and ia64) do the "use fake vma" thing instead, and thus got caught up in the vma_init() changes. At the same time, the TLB flushing code really doesn't care about most other fields in the vma, so vma_init() is just unnecessary and pointless. This fixes things by having an explicit "this is just an initializer for the TLB flush" initializer macro, which is used by the arm/arm64/ia64 people who mis-use this interface with just a dummy vma. Fixes: 2c4541e24c55 ("mm: use vma_init() to initialize VMAs on stack and data segments") Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Kirill Shutemov Cc: Andrew Morton Cc: John Stultz Cc: Hugh Dickins Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ba6d356d18f..68a5121694ef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -466,6 +466,9 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) vma->vm_ops = NULL; } +/* flush_tlb_range() takes a vma, not a mm, and can care about flags */ +#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } + struct mmu_gather; struct inode; -- cgit v1.2.3