From 5298d4bfe80f6ae6ae2777bcd1357b0022d98573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 07:56:14 +0100 Subject: unicode: clean up the Kconfig symbol confusion Turn the CONFIG_UNICODE symbol into a tristate that generates some always built in code and remove the confusing CONFIG_UNICODE_UTF8_DATA symbol. Note that a lot of the IS_ENABLED() checks could be turned from cpp statements into normal ifs, but this change is intended to be fairly mechanical, so that should be cleaned up later. Fixes: 2b3d04787012 ("unicode: Add utf8-data module") Reported-by: Linus Torvalds Reviewed-by: Eric Biggers Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..fdac22d16c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,7 @@ struct super_block { #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif -- cgit v1.2.3 From ebb7fb1557b1d03b906b668aa2164b51e6b7d19a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 26 Jan 2022 09:19:20 -0800 Subject: xfs, iomap: limit individual ioend chain lengths in writeback Trond Myklebust reported soft lockups in XFS IO completion such as this: watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106] CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1 Workqueue: xfs-conv/md127 xfs_end_io [xfs] RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20 Call Trace: wake_up_page_bit+0x8a/0x110 iomap_finish_ioend+0xd7/0x1c0 iomap_finish_ioends+0x7f/0xb0 xfs_end_ioend+0x6b/0x100 [xfs] xfs_end_io+0xb9/0xe0 [xfs] process_one_work+0x1a7/0x360 worker_thread+0x1fa/0x390 kthread+0x116/0x130 ret_from_fork+0x35/0x40 Ioends are processed as an atomic completion unit when all the chained bios in the ioend have completed their IO. Logically contiguous ioends can also be merged and completed as a single, larger unit. Both of these things can be problematic because the bio chains per ioend and the size of the merged ioends processed as a single completion are both unbound. If we have a large sequential dirty region in the page cache, write_cache_pages() will keep feeding us sequential pages and we will keep mapping them into ioends and bios until we get a dirty page at a non-sequential file offset. These large sequential runs will result in bio and ioend chaining to optimise the io patterns. The pages under writeback are pinned within these chains until the submission chaining is broken, allowing the entire chain to be completed. This can result in huge chains being processed in IO completion context. We get deep bio chaining if we have large contiguous physical extents. We will keep adding pages to the current bio until it is full, then we'll chain a new bio to keep adding pages for writeback. Hence we can build bio chains that map millions of pages and tens of gigabytes of RAM if the page cache contains big enough contiguous dirty file regions. This long bio chain pins those pages until the final bio in the chain completes and the ioend can iterate all the chained bios and complete them. OTOH, if we have a physically fragmented file, we end up submitting one ioend per physical fragment, each of which has a small bio or bio chain attached to it.
We do not chain these at IO submission time, but instead we chain them at completion time based on file offset via iomap_ioend_try_merge(). Hence we can end up with unbound ioend chains being built via completion merging. XFS can then do COW remapping or unwritten extent conversion on that merged chain, which involves walking an extent fragment at a time and running a transaction to modify the physical extent information. IOWs, we merge all the discontiguous ioends together into a contiguous file range, only to then process them individually as discontiguous extents. This extent manipulation is computationally expensive and can run in a tight loop, so merging logically contiguous but physically discontiguous ioends gains us nothing except for hiding the fact that we broke the ioends up into individual physical extents at submission and then need to loop over those individual physical extents at completion. Hence we need to have mechanisms to limit ioend sizes and to break up completion processing of large merged ioend chains: 1. bio chains per ioend need to be bound in length. Pure overwrites go straight to iomap_finish_ioend() in softirq context with the exact bio chain attached to the ioend by submission. Hence the only way to prevent long holdoffs here is to bound ioend submission sizes because we can't reschedule in softirq context. 2. iomap_finish_ioends() has to handle unbound merged ioend chains correctly. This relies on any one call to iomap_finish_ioend() being bound in runtime so that cond_resched() can be issued regularly as the long ioend chain is processed. i.e. this relies on mechanism #1 to limit individual ioend sizes to work correctly. 3. filesystems have to loop over the merged ioends to process physical extent manipulations. This means they can loop internally, and so we break merging at physical extent boundaries so the filesystem can easily insert reschedule points between individual extent manipulations. Signed-off-by: Dave Chinner Reported-and-tested-by: Trond Myklebust Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b55bd49e55f5..97a3a2edb585 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -263,9 +263,11 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ + u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ + sector_t io_sector; /* start sector of ioend */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; -- cgit v1.2.3 From ef9989afda73332df566852d6e9ca695c05f10ce Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:22 +0000 Subject: kvm: add guest_state_{enter,exit}_irqoff() When transitioning to/from guest mode, it is necessary to inform lockdep, tracing, and RCU in a specific order, similar to the requirements for transitions to/from user mode. Additionally, it is necessary to perform vtime accounting for a window around running the guest, with RCU enabled, such that timer interrupts taken from the guest can be accounted as guest time. Most architectures don't handle all the necessary pieces, and have a number of common bugs, including unsafe usage of RCU during the window between guest_enter() and guest_exit().
On x86, this was dealt with across commits: 87fa7f3e98a1310e ("x86/kvm: Move context tracking where it belongs") 0642391e2139a2c1 ("x86/kvm/vmx: Add hardirq tracing to guest enter/exit") 9fc975e9efd03e57 ("x86/kvm/svm: Add hardirq tracing on guest enter/exit") 3ebccdf373c21d86 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text") 135961e0a7d555fc ("x86/kvm/svm: Move guest enter/exit into .noinstr.text") 160457140187c5fb ("KVM: x86: Defer vtime accounting 'til after IRQ handling") bc908e091b326467 ("KVM: x86: Consolidate guest enter/exit logic to common helpers") ... but those fixes are specific to x86, and as the resulting logic (while correct) is split across generic helper functions and x86-specific helper functions, it is difficult to see that the entry/exit accounting is balanced. This patch adds generic helpers which architectures can use to handle guest entry/exit consistently and correctly. The guest_{enter,exit}() helpers are split into guest_timing_{enter,exit}() to perform vtime accounting, and guest_context_{enter,exit}() to perform the necessary context tracking and RCU management. The existing guest_{enter,exit}() helpers are left as wrappers of these. Atop this, new guest_state_enter_irqoff() and guest_state_exit_irqoff() helpers are added to handle the ordering of lockdep, tracing, and RCU management. These are intended to mirror exit_to_user_mode() and enter_from_user_mode(). Subsequent patches will migrate architectures over to the new helpers, following a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); This sequence handles all of the above correctly, and more clearly balances the entry and exit portions, making it easier to understand. The existing helpers are marked as deprecated, and will be removed once all architectures have been converted. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Paolo Bonzini Reviewed-by: Nicolas Saenz Julienne Message-Id: <20220201132926.3301912-2-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 112 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f079820f52b5..b3810976a27f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -368,8 +370,11 @@ struct kvm_vcpu { u64 last_used_slot_gen; }; -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +/* + * Start accounting time towards a guest. + * Must be called before entering guest context. + */ +static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the @@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); +} +/* + * Enter guest context and enter an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable.
+ */ +static __always_inline void guest_context_enter_irqoff(void) +{ /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +/* + * Deprecated. Architectures should move to guest_timing_enter_irqoff() and + * guest_state_enter_irqoff(). + */ +static __always_inline void guest_enter_irqoff(void) +{ + guest_timing_enter_irqoff(); + guest_context_enter_irqoff(); +} + +/** + * guest_state_enter_irqoff - Fixup state when entering a guest + * + * Entry to a guest will enable interrupts, but the kernel state is interrupts + * disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code before entering a guest. + * Must be called with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_enter_irqoff() before this. + * + * Note: this is analogous to exit_to_user_mode(). + */ +static __always_inline void guest_state_enter_irqoff(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_context_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* + * Exit guest context and exit an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_exit_irqoff(void) { context_tracking_guest_exit(); +} +/* + * Stop accounting time towards a guest. + * Must be called after exiting guest context. + */ +static __always_inline void guest_timing_exit_irqoff(void) +{ instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } +/* + * Deprecated. Architectures should move to guest_state_exit_irqoff() and + * guest_timing_exit_irqoff(). + */ +static __always_inline void guest_exit_irqoff(void) +{ + guest_context_exit_irqoff(); + guest_timing_exit_irqoff(); +} + static inline void guest_exit(void) { unsigned long flags; @@ -413,6 +492,33 @@ static inline void guest_exit(void) local_irq_restore(flags); } +/** + * guest_state_exit_irqoff - Establish state when returning from guest mode + * + * Entry from a guest disables interrupts, but guest mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific code after exiting a guest. + * Must be invoked with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_exit_irqoff() after this. + * + * Note: this is analogous to enter_from_user_mode(). 
+ */ +static __always_inline void guest_state_exit_irqoff(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_context_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* -- cgit v1.2.3 From bee9f65523218e3baeeecde9295c8fbe9bc08e0a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:50 +0000 Subject: netfs, cachefiles: Add a method to query presence of data in the cache Add a netfs_cache_ops method by which a network filesystem can ask the cache about what data it has available and where so that it can make a multipage read more efficient. Signed-off-by: David Howells cc: linux-cachefs@redhat.com Acked-by: Jeff Layton Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- include/linux/netfs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index b46c39d98bbd..614f22213e21 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -244,6 +244,13 @@ struct netfs_cache_ops { int (*prepare_write)(struct netfs_cache_resources *cres, loff_t *_start, size_t *_len, loff_t i_size, bool no_space_allocated_yet); + + /* Query the occupancy of the cache in a region, returning where the + * next chunk of data starts and how long it is. + */ + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; struct readahead_control; -- cgit v1.2.3 From 6d5c900eb64107001e91e1f46bddc254dded8a59 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 09:22:41 -0800 Subject: net/mlx5e: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use struct_group() in struct vlan_ethhdr around members h_dest and h_source, so they can be referenced together. This will allow memcpy() and sizeof() to more easily reason about sizes, improve readability, and avoid future warnings about writing beyond the end of h_dest. "pahole" shows no size nor member offset changes to struct vlan_ethhdr. "objdump -d" shows no object code changes. Fixes: 34802a42b352 ("net/mlx5e: Do not modify the TX SKB") Signed-off-by: Kees Cook Signed-off-by: Saeed Mahameed --- include/linux/if_vlan.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8420fe504927..2be4dd7e90a9 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -46,8 +46,10 @@ struct vlan_hdr { * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { - unsigned char h_dest[ETH_ALEN]; - unsigned char h_source[ETH_ALEN]; + struct_group(addrs, + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; -- cgit v1.2.3 From 1148836fd3226c20de841084aba24184d4fbbe77 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:29 +0100 Subject: Revert "fbdev: Garbage collect fbdev scrolling acceleration, part 1 (from TODO list)" This reverts commit b3ec8cdf457e5e63d396fe1346cc788cf7c1b578. Revert the second (of 2) commits which disabled scrolling acceleration in fbcon/fbdev. 
It introduced a regression for fbdev-supported graphic cards because of the performance penalty of doing screen scrolling in software instead of using the existing graphic card 2D hardware acceleration. Console scrolling acceleration was disabled by dropping code which checked at runtime the driver hardware capabilities for the FBINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and, if set, enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move screen contents. After dropping those checks scrollmode was hard-wired to SCROLL_REDRAW instead, which forces all graphic cards to redraw every character at the new screen position when scrolling. This change effectively disabled all hardware-based scrolling acceleration for ALL drivers, because now no kind of 2D hardware acceleration (bitblt, fillrect) in the drivers is used any longer. The original commit message mentions that only 3 DRM drivers (nouveau, omapdrm and gma500) used hardware acceleration in the past and thus code for checking and using scrolling acceleration is obsolete. This statement is NOT TRUE, because besides the DRM drivers there are around 35 other fbdev drivers which depend on fbdev/fbcon and still provide hardware acceleration for fbdev/fbcon. The original commit message also states that syzbot found lots of bugs in fbcon and thus it's "often the solution to just delete code and remove features". This is true, and the bugs - which actually affected all users of fbcon, including DRM - were fixed, or code was dropped like e.g. the support for software scrollback in vgacon (commit 973c096f6a85). So to further analyze which bugs were found by syzbot, I've looked through all patches in drivers/video which were tagged with syzbot or syzkaller back to year 2005. The vast majority fixed the reported issues on a higher level, e.g. when screen is to be resized, or when font size is to be changed. The few ones which touched driver code fixed a real driver bug, e.g. by adding a check. But NONE of those patches touched code of either the SCROLL_MOVE or the SCROLL_REDRAW case. That means there was no real reason why SCROLL_MOVE had to be ripped-out and just SCROLL_REDRAW had to be used instead. The only reason I can imagine so far was that SCROLL_MOVE wasn't used by DRM and as such it was assumed that it could go away. That argument completely missed the fact that SCROLL_MOVE is still heavily used by fbdev (non-DRM) drivers. Some people mention that using memcpy() instead of the hardware acceleration is pretty much the same speed. But that's not true, at least not for older graphic cards and machines where we see speed decreases by a factor of 10 and more, and thus this change leads to console responsiveness far worse than before. That's why the original commit is to be reverted. By reverting we reintroduce hardware-based scrolling acceleration and fix the performance regression for fbdev drivers. There isn't any impact on DRM when reverting those patches.
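To make the dependency concrete, here is a rough sketch (not taken from any of the reverted patches; the mydrv_* names and the blitter helper are hypothetical) of how a non-DRM fbdev driver advertises the 2D acceleration that fbcon's SCROLL_MOVE mode relies on:

static void mydrv_copyarea(struct fb_info *info,
			   const struct fb_copyarea *area)
{
	/* program the card's blitter: source, destination and size */
	mydrv_hw_blit(info->par, area->sx, area->sy,
		      area->dx, area->dy, area->width, area->height);
}

static const struct fb_ops mydrv_fb_ops = {
	.owner        = THIS_MODULE,
	.fb_fillrect  = mydrv_fillrect,     /* hardware solid fill */
	.fb_copyarea  = mydrv_copyarea,     /* hardware screen-to-screen copy */
	.fb_imageblit = cfb_imageblit,      /* software fallback is fine here */
};

/* in the probe path: these are the capability flags the dropped fbcon
 * code checked before selecting SCROLL_MOVE instead of SCROLL_REDRAW */
info->fbops = &mydrv_fb_ops;
info->flags = FBINFO_DEFAULT | FBINFO_HWACCEL_COPYAREA |
	      FBINFO_HWACCEL_FILLRECT;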
Signed-off-by: Helge Deller Acked-by: Geert Uytterhoeven Acked-by: Sven Schnelle Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-2-deller@gmx.de --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 3da95842b207..02f362c661c8 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -262,7 +262,7 @@ struct fb_ops { /* Draws a rectangle */ void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect); - /* Copy data from area to another. Obsolete. */ + /* Copy data from area to another */ void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region); /* Draws a image to the display */ void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image); -- cgit v1.2.3 From e1d2699b96793d19388e302fa095e0da2c145701 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Jan 2022 22:10:52 -0500 Subject: NFS: Avoid duplicate uncached readdir calls on eof If we've reached the end of the directory, then cache that information in the context so that we don't need to do an uncached readdir in order to rediscover that fact. Fixes: 794092c57f89 ("NFS: Do uncached readdir when we're seeking a cookie in an empty page cache") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 02aa49323d1d..68f81d8d36de 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -107,6 +107,7 @@ struct nfs_open_dir_context { __u64 dup_cookie; pgoff_t page_index; signed char duped; + bool eof; }; /* -- cgit v1.2.3 From 2ea88716369ac9a7486a8cb309d6bf1239ea156c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 23 Jan 2022 17:27:47 +0100 Subject: libceph: make recv path in secure mode work the same as send path The recv path of secure mode is intertwined with that of crc mode. While it's slightly more efficient that way (the ciphertext is read into the destination buffer and decrypted in place, thus avoiding two potentially heavy memory allocations for the bounce buffer and the corresponding sg array), it isn't really amenable to changes. Sacrifice that edge and align with the send path which always uses a full-sized bounce buffer (currently there is no other way -- if the kernel crypto API ever grows support for streaming (piecewise) en/decryption for GCM [1], we would be able to easily take advantage of that on both sides). 
[1] https://lore.kernel.org/all/20141225202830.GA18794@gondor.apana.org.au/ Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/messenger.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index ff99ce094cfa..6c6b6ea52bb8 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -383,6 +383,10 @@ struct ceph_connection_v2_info { struct ceph_gcm_nonce in_gcm_nonce; struct ceph_gcm_nonce out_gcm_nonce; + struct page **in_enc_pages; + int in_enc_page_cnt; + int in_enc_resid; + int in_enc_i; struct page **out_enc_pages; int out_enc_page_cnt; int out_enc_resid; -- cgit v1.2.3 From 038b8d1d1ab1cce11a158d30bf080ff41a2cfd15 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 30 Dec 2021 15:13:32 +0100 Subject: libceph: optionally use bounce buffer on recv path in crc mode Both msgr1 and msgr2 in crc mode are zero copy in the sense that message data is read from the socket directly into the destination buffer. We assume that the destination buffer is stable (i.e. remains unchanged while it is being read to) though. Otherwise, CRC errors ensue: libceph: read_partial_message 0000000048edf8ad data crc 1063286393 != exp. 228122706 libceph: osd1 (1)192.168.122.1:6843 bad crc/signature libceph: bad data crc, calculated 57958023, expected 1805382778 libceph: osd2 (2)192.168.122.1:6876 integrity error, bad crc Introduce rxbounce option to enable use of a bounce buffer when receiving message data. In particular this is needed if a mapped image is a Windows VM disk, passed to QEMU. Windows has a system-wide "dummy" page that may be mapped into the destination buffer (potentially more than once into the same buffer) by the Windows Memory Manager in an effort to generate a single large I/O [1][2]. QEMU makes a point of preserving overlap relationships when cloning I/O vectors, so krbd gets exposed to this behaviour. [1] "What Is Really in That MDL?" 
https://docs.microsoft.com/en-us/previous-versions/windows/hardware/design/dn614012(v=vs.85) [2] https://blogs.msmvps.com/kernelmustard/2005/05/04/dummy-pages/ URL: https://bugzilla.redhat.com/show_bug.cgi?id=1973317 Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/messenger.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 6a89ea410e43..edf62eaa6285 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -35,6 +35,7 @@ #define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */ #define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */ #define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */ +#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6c6b6ea52bb8..e7f2fb2fc207 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -461,6 +461,7 @@ struct ceph_connection { struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ + struct page *bounce_page; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ -- cgit v1.2.3 From bfb1a7c91fb7758273b4a8d735313d9cc388b502 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 2 Feb 2022 12:55:53 -0800 Subject: x86/bug: Merge annotate_reachable() into _BUG_FLAGS() asm In __WARN_FLAGS(), we had two asm statements (abbreviated): asm volatile("ud2"); asm volatile(".pushsection .discard.reachable"); This pair of statements is used to trigger an exception, but then helps objtool understand that for warnings, control flow will be restored immediately afterwards. The problem is that volatile is not a compiler barrier. GCC explicitly documents this: > Note that the compiler can move even volatile asm instructions > relative to other code, including across jump instructions. Also, no clobbers are specified to prevent instructions from subsequent statements from being scheduled by the compiler before the second asm statement. This can lead to instructions from subsequent statements being emitted by the compiler before the second asm statement. Providing a scheduling model such as via -march= options enables the compiler to better schedule instructions with known latencies to hide latencies from data hazards compared to inline asm statements in which latencies are not estimated. If an instruction gets scheduled by the compiler between the two asm statements, then objtool will think that it is not reachable, producing a warning. To prevent instructions from being scheduled in between the two asm statements, merge them. Also remove an unnecessary unreachable() asm annotation from BUG() in favor of __builtin_unreachable(). objtool is able to track that the ud2 from BUG() terminates control flow within the function.
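As a simplified illustration of the change (a sketch of the idea only, not the kernel's actual _BUG_FLAGS()/__WARN_FLAGS() macro bodies):

/* Before (abbreviated as in the message above): the compiler may
 * schedule instructions from later statements between these two, and
 * objtool then flags those instructions as unreachable. */
asm volatile("ud2");
asm volatile(".pushsection .discard.reachable");

/* After: one merged asm statement keeps the reachability annotation
 * glued to the trapping instruction (ASM_REACHABLE is the string
 * macro added in the compiler.h hunk below). */
asm volatile("ud2\n\t" ASM_REACHABLE);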
Link: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Volatile Link: https://github.com/ClangBuiltLinux/linux/issues/1483 Signed-off-by: Nick Desaulniers Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220202205557.2260694-1-ndesaulniers@google.com --- include/linux/compiler.h | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 429dcebe2b99..0f7fd205ab7e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -117,14 +117,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n -#define __annotate_reachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ -}) -#define annotate_reachable() __annotate_reachable(__COUNTER__) - #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -133,24 +125,21 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, }) #define annotate_unreachable() __annotate_unreachable(__COUNTER__) -#define ASM_UNREACHABLE \ - "999:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long 999b - .\n\t" \ +#define ASM_REACHABLE \ + "998:\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long 998b - .\n\t" \ ".popsection\n\t" /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table") #else -#define annotate_reachable() #define annotate_unreachable() +# define ASM_REACHABLE #define __annotate_jump_table #endif -#ifndef ASM_UNREACHABLE -# define ASM_UNREACHABLE -#endif #ifndef unreachable # define unreachable() do { \ annotate_unreachable(); \ -- cgit v1.2.3 From e85c81ba8859a4c839bcd69c5d83b32954133a5b Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:54 +0800 Subject: ext4: fast commit may not fallback for ineligible commit For the following scenario: 1. jbd starts committing transaction n 2. task A gets a new handle for transaction n+1 3. task A does some ineligible actions and marks FC_INELIGIBLE 4. jbd completes transaction n and clears FC_INELIGIBLE 5. task A calls fsync In this case fast commit will not fall back to full commit and transaction n+1 is also not handled by jbd. Make ext4_fc_mark_ineligible() also record the transaction tid of the latest ineligible case; when ext4_fc_cleanup() is called, check the current transaction tid, and if it is smaller than the latest ineligible tid do not clear EXT4_MF_FC_INELIGIBLE. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Ritesh Harjani Suggested-by: Harshad Shirwadkar Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd933c45281a..d63b8106796e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1295,7 +1295,7 @@ struct journal_s * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation.
*/ - void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: -- cgit v1.2.3 From 3ca40c0d329113a9f76f6aa01abe73d9f16ace9d Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:50 +0530 Subject: jbd2: cleanup unused functions declarations from jbd2.h During code review, no references were found to a few of the function declarations below. This patch cleans those up from jbd2.h Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/30d1fc327becda197a4136cf9cdc73d9baa3b7b9.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d63b8106796e..afc5572e7b8a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1419,9 +1419,7 @@ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_free_buffer(struct journal_head *bh); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_clean_data_list(transaction_t *transaction); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); @@ -1486,9 +1484,6 @@ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct buffer_head **bh_out, sector_t blocknr); -/* Transaction locking */ -extern void __wait_on_journal (journal_t *); - /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); @@ -1774,8 +1769,6 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -extern int jbd_blocks_per_page(struct inode *inode); /* JBD uses a CRC32 checksum */ #define JBD_MAX_CHECKSUM_SIZE 4 -- cgit v1.2.3 From 4f98186848707f530669238d90e0562d92a78aab Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:51 +0530 Subject: jbd2: refactor wait logic for transaction updates into a common function No functionality change as such in this patch.
This only refactors the common piece of code which waits for t_updates to finish into a common function named jbd2_journal_wait_updates(journal_t *). Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/8c564f70f4b2591171677a2a74fccb22a7b6c3a4.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index afc5572e7b8a..9c3ada74ffb1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -594,7 +594,7 @@ struct transaction_s */ unsigned long t_log_start; - /* + /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ @@ -1538,6 +1538,8 @@ extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); +void jbd2_journal_wait_updates(journal_t *); + extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); -- cgit v1.2.3 From 67d6212afda218d564890d1674bab28e8612170f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 27 Jan 2022 15:39:53 -0800 Subject: Revert "module, async: async_synchronize_full() on module init iff async is used" This reverts commit 774a1221e862b343388347bac9b318767336b20b. We need to finish all async code before the module init sequence is done. In the reverted commit the PF_USED_ASYNC flag was added to mark a thread that called async_schedule(). Then the PF_USED_ASYNC flag was used to determine whether or not async_synchronize_full() needs to be invoked. This works when the modprobe thread is the one calling async_schedule(), but it does not work if a module dispatches init code to a worker thread which then calls async_schedule(). For example, PCI driver probing is invoked from a worker thread based on the node where the device is attached: if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); We end up in a situation where a worker thread gets the PF_USED_ASYNC flag set instead of the modprobe thread. As a result, async_synchronize_full() is not invoked and modprobe completes without waiting for the async code to finish. The issue was discovered while loading the pm80xx driver: (scsi_mod.scan=async) modprobe pm80xx worker ... do_init_module() ... pci_call_probe() work_on_cpu(local_pci_probe) local_pci_probe() pm8001_pci_probe() scsi_scan_host() async_schedule() worker->flags |= PF_USED_ASYNC; ... < return from worker > ... if (current->flags & PF_USED_ASYNC) <--- false async_synchronize_full(); Commit 21c3c5d28007 ("block: don't request module during elevator init") fixed the deadlock issue which the reverted commit 774a1221e862 ("module, async: async_synchronize_full() on module init iff async is used") tried to fix. Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous request_module() from async workers") synchronous module loading from async is not allowed. Given that the original deadlock issue is fixed and it is no longer allowed to call synchronous request_module() from async, we can remove the PF_USED_ASYNC flag to make module init consistently invoke async_synchronize_full() unless async module probe is requested.
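For illustration only (the driver and function names below are made up, not taken from pm80xx), this is the shape of the pattern described above: the module's probe runs on a worker via work_on_cpu(), so any async work it kicks off used to tag the worker thread rather than the modprobe task:

#include <linux/async.h>
#include <linux/pci.h>

static void mydrv_scan_async(void *data, async_cookie_t cookie)
{
	/* slow device scan; may still be running when module init returns */
}

static int mydrv_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	/* ... set up the HBA ... */

	/* Before this revert, PF_USED_ASYNC was set on the worker thread
	 * executing this probe, so modprobe never saw it and skipped
	 * async_synchronize_full() at the end of module init. */
	async_schedule(mydrv_scan_async, pdev);
	return 0;
}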
Signed-off-by: Igor Pylypiv Reviewed-by: Changyuan Lyu Reviewed-by: Luis Chamberlain Acked-by: Tejun Heo Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f5b2be39a78c..75ba8aa60248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1680,7 +1680,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ -- cgit v1.2.3 From ac9f0c810684a1b161c18eb4b91ce84cbc13c91d Mon Sep 17 00:00:00 2001 From: Anton Lundin Date: Thu, 3 Feb 2022 10:41:35 +0100 Subject: ata: libata-core: Introduce ATA_HORKAGE_NO_LOG_DIR horkage 06f6c4c6c3e8 ("ata: libata: add missing ata_identify_page_supported() calls") introduced additional calls to ata_identify_page_supported(), thus also indirectly adding accesses to the device log directory log page through ata_log_supported(). Reading this log page causes SATADOM-ML 3ME devices to lock up. Introduce the horkage flag ATA_HORKAGE_NO_LOG_DIR to prevent accesses to the log directory in ata_log_supported() and add a blacklist entry with this flag for "SATADOM-ML 3ME" devices. Fixes: 636f6e2af4fb ("libata: add horkage for missing Identify Device log") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Anton Lundin Signed-off-by: Damien Le Moal --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 605756f645be..7f99b4d78822 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -380,6 +380,7 @@ enum { ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ + ATA_HORKAGE_NO_LOG_DIR = (1 << 29), /* Do not read log directory */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 80110bbfbba6f0078d5a1cbc8df004506db8ffe5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:24 -0800 Subject: mm/page_table_check: check entries at pmd levels syzbot detected a case where the page table counters were not properly updated. syzkaller login: ------------[ cut here ]------------ kernel BUG at mm/page_table_check.c:162! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 3099 Comm: pasha Not tainted 5.16.0+ #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO4 RIP: 0010:__page_table_check_zero+0x159/0x1a0 Call Trace: free_pcp_prepare+0x3be/0xaa0 free_unref_page+0x1c/0x650 free_compound_page+0xec/0x130 free_transhuge_page+0x1be/0x260 __put_compound_page+0x90/0xd0 release_pages+0x54c/0x1060 __pagevec_release+0x7c/0x110 shmem_undo_range+0x85e/0x1250 ... The repro involved having a huge page that is split due to an uprobe event temporarily replacing one of the pages in the huge page. Later the huge page was combined again, but the counters were off, as the PTE level was not properly updated.
Make sure that when the PMD is cleared, and prior to freeing the level, the PTEs are updated. Link: https://lkml.kernel.org/r/20220131203249.2832273-5-pasha.tatashin@soleen.com Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_table_check.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 38cace1da7b6..01e16c7696ec 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -26,6 +26,9 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { @@ -100,6 +103,16 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, __page_table_check_pud_set(mm, addr, pudp, pud); } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear_range(mm, addr, pmd); +} + #else static inline void page_table_check_alloc(struct page *page, unsigned int order) @@ -143,5 +156,11 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, { } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ +} + #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */ -- cgit v1.2.3 From 314c459a6fe0957b5885fbc65c53d51444092880 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 3 Feb 2022 20:49:29 -0800 Subject: mm/pgtable: define pte_index so that preprocessor could recognize it Since commit 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") pte_index is a static inline and there is no define for it that can be recognized by the preprocessor. As a result, vm_insert_pages() uses a slower loop over vm_insert_page() instead of insert_pages(), which amortizes the cost of spinlock operations when inserting multiple pages.
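The fix relies on the common kernel idiom of defining a macro with the same name as the static inline so that #ifdef can see it; roughly (a sketch of the pattern, not the exact mm/memory.c code):

static inline unsigned long pte_index(unsigned long address)
{
	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}
#define pte_index pte_index	/* now visible to the preprocessor */

/* consumer side, e.g. inside vm_insert_pages(): */
#ifdef pte_index
	/* batched path: take the PTE lock once for all 'num' pages */
#else
	/* fallback: one vm_insert_page() call (and lock round trip) per page */
#endif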
Link: https://lkml.kernel.org/r/20220111145457.20748-1-rppt@kernel.org Fixes: 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") Signed-off-by: Mike Rapoport Reported-by: Christian Dietrich Reviewed-by: Khalid Aziz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc8713a76e03..f4f4077b97aa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -62,6 +62,7 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } +#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) -- cgit v1.2.3 From fda17afc6166e975bec1197bd94cd2a3317bce3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 7 Feb 2022 11:27:53 +0900 Subject: ata: libata-core: Fix ata_dev_config_cpr() The concurrent positioning ranges log page 47h is a general purpose log page and not a subpage of the identify device log. Using ata_identify_page_supported() to test for concurrent positioning ranges support is thus wrong. ata_log_supported() must be used. Furthermore, unlike other advanced ATA features (e.g. NCQ priority), accesses to the concurrent positioning ranges log page are not gated by a feature bit from the device IDENTIFY data. Since many older drives react badly to the READ LOG EXT and/or READ LOG DMA EXT commands issued to read device log pages, avoid problems with older drives by limiting the concurrent positioning ranges support detection to drives implementing at least the ACS-4 ATA standard (major version 11). This additional condition effectively turns ata_dev_config_cpr() into a nop for older drives, avoiding problems in the field. Fixes: fe22e1c2f705 ("libata: support concurrent positioning ranges log") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215519 Cc: stable@vger.kernel.org Reviewed-by: Hannes Reinecke Tested-by: Abderraouf Adjal Signed-off-by: Damien Le Moal --- include/linux/ata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 199e47e97d64..21292b5bbb55 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -324,12 +324,12 @@ enum { ATA_LOG_NCQ_NON_DATA = 0x12, ATA_LOG_NCQ_SEND_RECV = 0x13, ATA_LOG_IDENTIFY_DEVICE = 0x30, + ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device log pages: */ ATA_LOG_SECURITY = 0x06, ATA_LOG_SATA_SETTINGS = 0x08, ATA_LOG_ZONED_INFORMATION = 0x09, - ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device SATA settings log:*/ ATA_LOG_DEVSLP_OFFSET = 0x30, -- cgit v1.2.3 From cb1f65c1e1424a4b5e4a86da8aa3b8fd8459c8ec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 4 Feb 2022 18:35:22 +0100 Subject: PM: s2idle: ACPI: Fix wakeup interrupts handling After commit e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") wakeup interrupts occurring immediately after the one discarded by acpi_s2idle_wake() may be missed. Moreover, if the SCI triggers again immediately after the rearming in acpi_s2idle_wake(), that wakeup may be missed too.
The problem is that pm_system_irq_wakeup() only calls pm_system_wakeup() when pm_wakeup_irq is 0, but that's not the case any more after the interrupt causing acpi_s2idle_wake() to run until pm_wakeup_irq is cleared by the pm_wakeup_clear() call in s2idle_loop(). However, there may be wakeup interrupts occurring in that time frame and if that happens, they will be missed. To address that issue, first move the clearing of pm_wakeup_irq to the point at which it is known that the interrupt causing acpi_s2idle_wake() to run will be discarded, before rearming the SCI for wakeup. Moreover, because that only reduces the size of the time window in which the issue may manifest itself, allow pm_system_irq_wakeup() to register two wakeup interrupts in a row and, when discarding the first one, replace it with the second one. [Of course, this assumes that only one wakeup interrupt can be discarded in one go, but currently that is the case and I am not aware of any plans to change that.] Fixes: e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3e8ecdebe601..300273ff40cc 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -497,14 +497,14 @@ extern void ksys_sync_helper(void); /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; -extern unsigned int pm_wakeup_irq; extern suspend_state_t pm_suspend_target_state; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); -extern void pm_wakeup_clear(bool reset); +extern void pm_wakeup_clear(unsigned int irq_number); extern void pm_system_irq_wakeup(unsigned int irq_number); +extern unsigned int pm_wakeup_irq(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); -- cgit v1.2.3 From c306d737691ef84305d4ed0d302c63db2932f0bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Jan 2022 15:57:45 -0500 Subject: NFSD: Deprecate NFS_OFFSET_MAX NFS_OFFSET_MAX was introduced way back in Linux v2.3.y before there was a kernel-wide OFFSET_MAX value. As a cleanup, replace the last few uses of it with its generic equivalent, and get rid of it. Signed-off-by: Chuck Lever --- include/linux/nfs.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 0dc7ad38a0da..b06375e88e58 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -36,14 +36,6 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc memcpy(target->data, source->data, source->size); } - -/* - * This is really a general kernel constant, but since nothing like - * this is defined in the kernel headers, I have to do it here.
- */ -#define NFS_OFFSET_MAX ((__s64)((~(__u64)0) >> 1)) - - enum nfs3_stable_how { NFS_UNSTABLE = 0, NFS_DATA_SYNC = 1, -- cgit v1.2.3 From 0764db9b49c932b89ee4d9e3236dff4bb07b4a66 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 11 Feb 2022 16:32:32 -0800 Subject: mm: memcg: synchronize objcg lists with a dedicated spinlock Alexander reported a circular lock dependency revealed by the mmap1 ltp test: LOCKDEP_CIRCULAR (suite: ltp, case: mtest06 (mmap1)) WARNING: possible circular locking dependency detected 5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1 Not tainted ------------------------------------------------------ mmap1/202299 is trying to acquire lock: 00000001892c0188 (css_set_lock){..-.}-{2:2}, at: obj_cgroup_release+0x4a/0xe0 but task is already holding lock: 00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sighand->siglock){-.-.}-{2:2}: __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 __lock_task_sighand+0x90/0x190 cgroup_freeze_task+0x2e/0x90 cgroup_migrate_execute+0x11c/0x608 cgroup_update_dfl_csses+0x246/0x270 cgroup_subtree_control_write+0x238/0x518 kernfs_fop_write_iter+0x13e/0x1e0 new_sync_write+0x100/0x190 vfs_write+0x22c/0x2d8 ksys_write+0x6c/0xf8 __do_syscall+0x1da/0x208 system_call+0x82/0xb0 -> #0 (css_set_lock){..-.}-{2:2}: check_prev_add+0xe0/0xed8 validate_chain+0x736/0xb20 __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 obj_cgroup_release+0x4a/0xe0 percpu_ref_put_many.constprop.0+0x150/0x168 drain_obj_stock+0x94/0xe8 refill_obj_stock+0x94/0x278 obj_cgroup_charge+0x164/0x1d8 kmem_cache_alloc+0xac/0x528 __sigqueue_alloc+0x150/0x308 __send_signal+0x260/0x550 send_signal+0x7e/0x348 force_sig_info_to_task+0x104/0x180 force_sig_fault+0x48/0x58 __do_pgm_check+0x120/0x1f0 pgm_check_handler+0x11e/0x180 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sighand->siglock); lock(css_set_lock); lock(&sighand->siglock); lock(css_set_lock); *** DEADLOCK *** 2 locks held by mmap1/202299: #0: 00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180 #1: 00000001892ad560 (rcu_read_lock){....}-{1:2}, at: percpu_ref_put_many.constprop.0+0x0/0x168 stack backtrace: CPU: 15 PID: 202299 Comm: mmap1 Not tainted 5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1 Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: dump_stack_lvl+0x76/0x98 check_noncircular+0x136/0x158 check_prev_add+0xe0/0xed8 validate_chain+0x736/0xb20 __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 obj_cgroup_release+0x4a/0xe0 percpu_ref_put_many.constprop.0+0x150/0x168 drain_obj_stock+0x94/0xe8 refill_obj_stock+0x94/0x278 obj_cgroup_charge+0x164/0x1d8 kmem_cache_alloc+0xac/0x528 __sigqueue_alloc+0x150/0x308 __send_signal+0x260/0x550 send_signal+0x7e/0x348 force_sig_info_to_task+0x104/0x180 force_sig_fault+0x48/0x58 __do_pgm_check+0x120/0x1f0 pgm_check_handler+0x11e/0x180 INFO: lockdep is turned off. In this example a slab allocation from __send_signal() caused a refilling and draining of a percpu objcg stock, resulted in a releasing of another non-related objcg. Objcg release path requires taking the css_set_lock, which is used to synchronize objcg lists. 
This can create a circular dependency with the sighandler lock, which is taken with the locked css_set_lock by the freezer code (to freeze a task). In general it seems that using css_set_lock to synchronize objcg lists makes any slab allocation or deallocation under the locked css_set_lock, or under any intervening locks, risky. To fix the problem and make the code more robust, let's stop using css_set_lock to synchronize objcg lists and use a new dedicated spinlock instead. Link: https://lkml.kernel.org/r/Yfm1IHmoGdyUR81T@carbon.dhcp.thefacebook.com Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API") Signed-off-by: Roman Gushchin Reported-by: Alexander Egorenkov Tested-by: Alexander Egorenkov Reviewed-by: Waiman Long Acked-by: Tejun Heo Reviewed-by: Shakeel Butt Reviewed-by: Jeremy Linton Tested-by: Jeremy Linton Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b72d75141e12..0abbd685703b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -219,7 +219,7 @@ struct obj_cgroup { struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { - struct list_head list; + struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; @@ -315,7 +315,8 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; struct obj_cgroup __rcu *objcg; - struct list_head objcg_list; /* list of inherited objcgs */ + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; #endif MEMCG_PADDING(_pad2_); -- cgit v1.2.3 From 8913c61001482378d4ed8cc577b17c1ba3e847e4 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 11 Feb 2022 16:32:35 -0800 Subject: kfence: make test case compatible with run time set sample interval The parameter kfence_sample_interval can be set via a boot parameter or a late shell command, which is convenient for automated tests and KFENCE parameter optimization. However, the KFENCE test case just uses the compile-time CONFIG_KFENCE_SAMPLE_INTERVAL, which makes the test case not run as users desire. Export kfence_sample_interval, so that the KFENCE test case can use the run-time-set sample interval. Link: https://lkml.kernel.org/r/20220207034432.185532-1-liupeng256@huawei.com Signed-off-by: Peng Liu Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Sumit Semwal Cc: Christian König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 4b5e3679a72c..f49e64222628 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -17,6 +17,8 @@ #include #include +extern unsigned long kfence_sample_interval; + /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an -- cgit v1.2.3
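For context, a hedged sketch of what the export enables in a KFENCE KUnit-style test (the helper below is illustrative, not the test's verbatim code): the test can size its allocation-wait timeout from the sample interval actually in effect at run time rather than the compile-time default.

#include <linux/jiffies.h>
#include <linux/kfence.h>
#include <linux/slab.h>

/* Keep allocating until a KFENCE-guarded object shows up or we give up. */
static bool alloc_from_kfence_pool(struct kmem_cache *cache)
{
	unsigned long timeout = jiffies +
		msecs_to_jiffies(100 * kfence_sample_interval);

	do {
		void *obj = kmem_cache_alloc(cache, GFP_KERNEL);
		bool hit = is_kfence_address(obj);

		kmem_cache_free(cache, obj);
		if (hit)
			return true;
	} while (time_before(jiffies, timeout));

	return false;
}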