From 6ef6a0e821d3dad6bf8a5d5508762dba9042c84b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Oct 2024 08:09:16 -0700 Subject: iomap: share iomap_unshare_iter predicate code with fsdax The predicate code that iomap_unshare_iter uses to decide if it's really needs to unshare a file range mapping should be shared with the fsdax version, because right now they're opencoded and inconsistent. Note that we simplify the predicate logic a bit -- we no longer allow unsharing of inline data mappings, but there aren't any filesystems that allow shared inline data currently. This is a fix in the sense that it should have been ported to fsdax. Fixes: b53fdb215d13 ("iomap: improve shared block detection in iomap_unshare_iter") Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/172796813294.1131942.15762084021076932620.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4ad12a3c8bae..d8a7fc84348c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -267,6 +267,7 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); +bool iomap_want_unshare_iter(const struct iomap_iter *iter); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, -- cgit v1.2.3 From 1442ee0011983f0c5c4b92380e6853afb513841a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Oct 2024 21:49:59 +0100 Subject: irqchip/gic-v4: Don't allow a VMOVP on a dying VPE Kunkun Jiang reported that there is a small window of opportunity for userspace to force a change of affinity for a VPE while the VPE has already been unmapped, but the corresponding doorbell interrupt still visible in /proc/irq/. Plug the race by checking the value of vmapp_count, which tracks whether the VPE is mapped ot not, and returning an error in this case. This involves making vmapp_count common to both GICv4.1 and its v4.0 ancestor. Fixes: 64edfaa9a234 ("irqchip/gic-v4.1: Implement the v4.1 flavour of VMAPP") Reported-by: Kunkun Jiang Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/c182ece6-2ba0-ce4f-3404-dba7a3ab6c52@huawei.com Link: https://lore.kernel.org/all/20241002204959.2051709-1-maz@kernel.org --- include/linux/irqchip/arm-gic-v4.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index ecabed6d3307..7f1f11a5e4e4 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -66,10 +66,12 @@ struct its_vpe { bool enabled; bool group; } sgi_config[16]; - atomic_t vmapp_count; }; }; + /* Track the VPE being mapped */ + atomic_t vmapp_count; + /* * Ensures mutual exclusion between affinity setting of the * vPE and vLPI operations using vpe->col_idx. -- cgit v1.2.3 From 04b670de2859a8a8b0830779f9c9bda5d39662ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 7 Oct 2024 16:54:11 -0400 Subject: closures: Add closure_wait_event_timeout() Add a closure version of wait_event_timeout(), with the same semantics. The closure version is useful because unlike wait_event(), it allows blocking code to run in the conditional expression. Cc: Coly Li Signed-off-by: Kent Overstreet --- include/linux/closure.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/closure.h b/include/linux/closure.h index 2af44427107d..880fe85e35e9 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -454,4 +454,39 @@ do { \ __closure_wait_event(waitlist, _cond); \ } while (0) +#define __closure_wait_event_timeout(waitlist, _cond, _until) \ +({ \ + struct closure cl; \ + long _t; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + closure_wait(waitlist, &cl); \ + if (_cond) { \ + _t = max_t(long, 1L, _until - jiffies); \ + break; \ + } \ + _t = max_t(long, 0L, _until - jiffies); \ + if (!_t) \ + break; \ + closure_sync_timeout(&cl, _t); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ + _t; \ +}) + +/* + * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if + * condition became true + */ +#define closure_wait_event_timeout(waitlist, _cond, _timeout) \ +({ \ + unsigned long _until = jiffies + _timeout; \ + (_cond) \ + ? max_t(long, 1L, _until - jiffies) \ + : __closure_wait_event_timeout(waitlist, _cond, _until);\ +}) + #endif /* _LINUX_CLOSURE_H */ -- cgit v1.2.3 From c657243ae12000dc57e3648b0ddd30da9ffd1f14 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2024 16:51:07 +0200 Subject: serial: qcom-geni: fix rx cancel dma status bit Cancelling an rx command is signalled using bit 14 of the rx DMA status register and not bit 11. This bit is currently unused, but this error becomes apparent, for example, when tracing the status register when closing the port. Fixes: eddac5af0654 ("soc: qcom: Add GENI based QUP Wrapper driver") Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20241009145110.16847-7-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/soc/qcom/geni-se.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h index c3bca9c0bf2c..2996a3c28ef3 100644 --- a/include/linux/soc/qcom/geni-se.h +++ b/include/linux/soc/qcom/geni-se.h @@ -258,8 +258,8 @@ struct geni_se { #define RX_DMA_PARITY_ERR BIT(5) #define RX_DMA_BREAK GENMASK(8, 7) #define RX_GENI_GP_IRQ GENMASK(10, 5) -#define RX_GENI_CANCEL_IRQ BIT(11) #define RX_GENI_GP_IRQ_EXT GENMASK(13, 12) +#define RX_GENI_CANCEL_IRQ BIT(14) /* SE_HW_PARAM_0 fields */ #define TX_FIFO_WIDTH_MSK GENMASK(29, 24) -- cgit v1.2.3 From 73ab05aa46b02d96509cb029a8d04fca7bbde8c7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 9 Oct 2024 21:44:32 -0400 Subject: sched/core: Disable page allocation in task_tick_mm_cid() With KASAN and PREEMPT_RT enabled, calling task_work_add() in task_tick_mm_cid() may cause the following splat. [ 63.696416] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [ 63.696416] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 610, name: modprobe [ 63.696416] preempt_count: 10001, expected: 0 [ 63.696416] RCU nest depth: 1, expected: 1 This problem is caused by the following call trace. sched_tick() [ acquire rq->__lock ] -> task_tick_mm_cid() -> task_work_add() -> __kasan_record_aux_stack() -> kasan_save_stack() -> stack_depot_save_flags() -> alloc_pages_mpol_noprof() -> __alloc_pages_noprof() -> get_page_from_freelist() -> rmqueue() -> rmqueue_pcplist() -> __rmqueue_pcplist() -> rmqueue_bulk() -> rt_spin_lock() The rq lock is a raw_spinlock_t. We can't sleep while holding it. IOW, we can't call alloc_pages() in stack_depot_save_flags(). The task_tick_mm_cid() function with its task_work_add() call was introduced by commit 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") in v6.4 kernel. Fortunately, there is a kasan_record_aux_stack_noalloc() variant that calls stack_depot_save_flags() while not allowing it to allocate new pages. To allow task_tick_mm_cid() to use task_work without page allocation, a new TWAF_NO_ALLOC flag is added to enable calling kasan_record_aux_stack_noalloc() instead of kasan_record_aux_stack() if set. The task_tick_mm_cid() function is modified to add this new flag. The possible downside is the missing stack trace in a KASAN report due to new page allocation required when task_work_add_noallloc() is called which should be rare. Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241010014432.194742-1-longman@redhat.com --- include/linux/task_work.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index cf5e7e891a77..2964171856e0 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func) } enum task_work_notify_mode { - TWA_NONE, + TWA_NONE = 0, TWA_RESUME, TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, + + TWA_FLAGS = 0xff00, + TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) -- cgit v1.2.3 From 1d7b2ce43d2c22a21dadaf689cb36a69570346a6 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 11 Oct 2024 11:01:03 +0800 Subject: net: enetc: add missing static descriptor and inline keyword Fix the build warnings when CONFIG_FSL_ENETC_MDIO is not enabled. The detailed warnings are shown as follows. include/linux/fsl/enetc_mdio.h:62:18: warning: no previous prototype for function 'enetc_hw_alloc' [-Wmissing-prototypes] 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ include/linux/fsl/enetc_mdio.h:62:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ | static 8 warnings generated. Fixes: 6517798dd343 ("enetc: Make MDIO accessors more generic and export to include/linux/fsl") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410102136.jQHZOcS4-lkp@intel.com/ Signed-off-by: Wei Fang Reviewed-by: Claudiu Manoil Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241011030103.392362-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/fsl/enetc_mdio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl/enetc_mdio.h b/include/linux/fsl/enetc_mdio.h index df25fffdc0ae..623ccfcbf39c 100644 --- a/include/linux/fsl/enetc_mdio.h +++ b/include/linux/fsl/enetc_mdio.h @@ -59,7 +59,8 @@ static inline int enetc_mdio_read_c45(struct mii_bus *bus, int phy_id, static inline int enetc_mdio_write_c45(struct mii_bus *bus, int phy_id, int devad, int regnum, u16 value) { return -EINVAL; } -struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) +static inline struct enetc_hw *enetc_hw_alloc(struct device *dev, + void __iomem *port_regs) { return ERR_PTR(-EINVAL); } #endif -- cgit v1.2.3 From cd9626e9ebc77edec33023fe95dab4b04ffc819d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Oct 2024 11:38:10 +0200 Subject: sched/fair: Fix external p->on_rq users Sean noted that ever since commit 152e11f6df29 ("sched/fair: Implement delayed dequeue") KVM's preemption notifiers have started mis-classifying preemption vs blocking. Notably p->on_rq is no longer sufficient to determine if a task is runnable or blocked -- the aforementioned commit introduces tasks that remain on the runqueue even through they will not run again, and should be considered blocked for many cases. Add the task_is_runnable() helper to classify things and audit all external users of the p->on_rq state. Also add a few comments. Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Reported-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20241010091843.GK33184@noisy.programming.kicks-ass.net --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..8a9517e6640c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2133,6 +2133,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline bool task_is_runnable(struct task_struct *p) +{ + return p->on_rq && !p->se.sched_delayed; +} + extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); -- cgit v1.2.3 From 71dce222d5865ccb19b231a84d26ca316a65e255 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 1 Oct 2024 15:06:11 +0800 Subject: ALSA/hda: intel-sdw-acpi: add support for sdw-manager-list property read The DisCo for SoundWire 2.0 spec adds support for a new sdw-manager-list property. Add it in backwards-compatible mode with 'sdw-master-count', which assumed that all links between 0..count-1 exist. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241001070611.63288-5-yung-chuan.liao@linux.intel.com --- include/linux/soundwire/sdw_intel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 37ae69365fe2..734dc1fa3b5b 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -227,7 +227,7 @@ struct sdw_intel_ops { /** * struct sdw_intel_acpi_info - Soundwire Intel information found in ACPI tables * @handle: ACPI controller handle - * @count: link count found with "sdw-master-count" property + * @count: link count found with "sdw-master-count" or "sdw-manager-list" property * @link_mask: bit-wise mask listing links enabled by BIOS menu * * this structure could be expanded to e.g. provide all the _ADR -- cgit v1.2.3 From c0adf8c3a9bf33f1dd1bf950601380f46a3fcec3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Oct 2024 10:59:12 +0200 Subject: iomap: factor out a iomap_last_written_block helper Split out a pice of logic from iomap_file_buffered_write_punch_delalloc that is useful for all iomap_end implementations. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- include/linux/iomap.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4ad12a3c8bae..62253739dedc 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -256,6 +256,20 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) return &i->iomap; } +/* + * Return the file offset for the first unchanged block after a short write. + * + * If nothing was written, round @pos down to point at the first block in + * the range, else round up to include the partially written block. + */ +static inline loff_t iomap_last_written_block(struct inode *inode, loff_t pos, + ssize_t written) +{ + if (unlikely(!written)) + return round_down(pos, i_blocksize(inode)); + return round_up(pos + written, i_blocksize(inode)); +} + ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, void *private); int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); -- cgit v1.2.3 From caf0ea451d97c33c5bbaa0074dad33b0b2a4e649 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Oct 2024 10:59:13 +0200 Subject: iomap: remove iomap_file_buffered_write_punch_delalloc Currently iomap_file_buffered_write_punch_delalloc can be called from XFS either with the invalidate lock held or not. To fix this while keeping the locking in the file system and not the iomap library code we'll need to life the locking up into the file system. To prepare for that, open code iomap_file_buffered_write_punch_delalloc in the only caller, and instead export iomap_write_delalloc_release. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- include/linux/iomap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 62253739dedc..d0420e962ffd 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -290,9 +290,9 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); -void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos, - loff_t length, ssize_t written, unsigned flag, - struct iomap *iomap, iomap_punch_t punch); +void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, + loff_t end_byte, unsigned flags, struct iomap *iomap, + iomap_punch_t punch); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, const struct iomap_ops *ops); -- cgit v1.2.3 From eb0c062161cf5f98556a906c48b0cfc019d9e89c Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 16 Sep 2024 15:33:20 +0200 Subject: gpu: host1x: Set up device DMA parameters In order to store device DMA parameters, the DMA framework depends on the device's dma_parms field to point at a valid memory location. Add backing storage for this in struct host1x_memory_context and point to it. Reported-by: Jonathan Hunter Reviewed-by: Christoph Hellwig Tested-by: Jon Hunter Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240916133320.368620-1-thierry.reding@gmail.com (cherry picked from commit b4ad4ef374d66cc8df3188bb1ddb65bce5fc9e50) Signed-off-by: Thierry Reding --- include/linux/host1x.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 9c8119ed13a4..c4dde3aafcac 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -466,6 +466,7 @@ struct host1x_memory_context { refcount_t ref; struct pid *owner; + struct device_dma_parameters dma_parms; struct device dev; u64 dma_mask; u32 stream_id; -- cgit v1.2.3 From f03b296e8b516dbd63f57fc9056c1b0da1b9a0ff Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 14 Oct 2024 21:27:58 +0200 Subject: fs: pass offset and result to backing_file end_write() callback This is needed for extending fuse inode size after fuse passthrough write. Suggested-by: Miklos Szeredi Link: https://lore.kernel.org/linux-fsdevel/CAJfpegs=cvZ_NYy6Q_D42XhYS=Sjj5poM1b5TzXzOVvX=R36aA@mail.gmail.com/ Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- include/linux/backing-file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 4b61b0e57720..2eed0ffb5e8f 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -16,7 +16,7 @@ struct backing_file_ctx { const struct cred *cred; struct file *user_file; void (*accessed)(struct file *); - void (*end_write)(struct file *); + void (*end_write)(struct file *, loff_t, ssize_t); }; struct file *backing_file_open(const struct path *user_path, int flags, -- cgit v1.2.3 From 8f3ce3d996bf1e2f8474ec3ddabdb8765c19e6ea Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 7 Oct 2024 16:30:49 +0200 Subject: mm: percpu: increase PERCPU_DYNAMIC_SIZE_SHIFT on certain builds. Arnd reported a build failure due to the BUILD_BUG_ON() statement in alloc_kmem_cache_cpus(). The test PERCPU_DYNAMIC_EARLY_SIZE < NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu) The factors that increase the right side of the equation: - PAGE_SIZE > 4KiB increases KMALLOC_SHIFT_HIGH - For the local_lock_t in kmem_cache_cpu: - PREEMPT_RT adds an actual lock. - LOCKDEP increases the size of the lock. - LOCK_STAT adds additional bytes plus padding to the lockdep structure. The net difference with and without PREEMPT_RT is 88 bytes for the lock_lock_t, 96 bytes for kmem_cache_cpu due to additional padding. This is enough to exceed the 80KiB limit with 16KiB page size - the 8KiB page size is fine. Increase PERCPU_DYNAMIC_SIZE_SHIFT to 13 on configs with PAGE_SIZE larger than 4KiB and LOCKDEP enabled. Link: https://lkml.kernel.org/r/20241007143049.gyMpEu89@linutronix.de Fixes: d8fccd9ca5f9 ("arm64: Allow to enable PREEMPT_RT.") Signed-off-by: Sebastian Andrzej Siewior Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410020326.iaZIteIx-lkp@intel.com/ Reported-by: Arnd Bergmann Closes: https://lore.kernel.org/20241004095702.637528-1-arnd@kernel.org Acked-by: Arnd Bergmann Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Christoph Lameter Cc: Dennis Zhou Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/percpu.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index b6321fc49159..52b5ea663b9f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -41,7 +41,11 @@ PCPU_MIN_ALLOC_SHIFT) #ifdef CONFIG_RANDOM_KMALLOC_CACHES -#define PERCPU_DYNAMIC_SIZE_SHIFT 12 +# if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PAGE_SIZE_4KB) +# define PERCPU_DYNAMIC_SIZE_SHIFT 13 +# else +# define PERCPU_DYNAMIC_SIZE_SHIFT 12 +#endif /* LOCKDEP and PAGE_SIZE > 4KiB */ #else #define PERCPU_DYNAMIC_SIZE_SHIFT 10 #endif -- cgit v1.2.3 From 963756aac1f011d904ddd9548ae82286d3a91f96 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 11 Oct 2024 12:24:44 +0200 Subject: mm: huge_memory: add vma_thp_disabled() and thp_disabled_by_hw() Patch series "mm: don't install PMD mappings when THPs are disabled by the hw/process/vma". During testing, it was found that we can get PMD mappings in processes where THP (and more precisely, PMD mappings) are supposed to be disabled. While it works as expected for anon+shmem, the pagecache is the problematic bit. For s390 KVM this currently means that a VM backed by a file located on filesystem with large folio support can crash when KVM tries accessing the problematic page, because the readahead logic might decide to use a PMD-sized THP and faulting it into the page tables will install a PMD mapping, something that s390 KVM cannot tolerate. This might also be a problem with HW that does not support PMD mappings, but I did not try reproducing it. Fix it by respecting the ways to disable THPs when deciding whether we can install a PMD mapping. khugepaged should already be taking care of not collapsing if THPs are effectively disabled for the hw/process/vma. This patch (of 2): Add vma_thp_disabled() and thp_disabled_by_hw() helpers to be shared by shmem_allowable_huge_orders() and __thp_vma_allowable_orders(). [david@redhat.com: rename to vma_thp_disabled(), split out thp_disabled_by_hw() ] Link: https://lkml.kernel.org/r/20241011102445.934409-2-david@redhat.com Fixes: 793917d997df ("mm/readahead: Add large folio readahead") Signed-off-by: Kefeng Wang Signed-off-by: David Hildenbrand Reported-by: Leo Fu Tested-by: Thomas Huth Reviewed-by: Ryan Roberts Cc: Boqiao Fu Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: Hugh Dickins Cc: Janosch Frank Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 67d0ab3c3bba..ef5b80e48599 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -322,6 +322,24 @@ struct thpsize { (transparent_hugepage_flags & \ (1<vm_mm->flags); +} + +static inline bool thp_disabled_by_hw(void) +{ + /* If the hardware/firmware marked hugepage support disabled. */ + return transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED); +} + unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, -- cgit v1.2.3 From 88a387cf9e5f7f7665e6dde8c6610f0ea65c5a6b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 1 Oct 2024 15:13:53 +0100 Subject: KVM: Remove unused kvm_vcpu_gfn_to_pfn The last use of kvm_vcpu_gfn_to_pfn was removed by commit b1624f99aa8f ("KVM: Remove kvm_vcpu_gfn_to_page() and kvm_vcpu_gpa_to_page()") Remove it. Signed-off-by: Dr. David Alan Gilbert Message-ID: <20241001141354.18009-2-linux@treblig.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index db567d26f7b9..b9b2e42e3fa7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1314,7 +1314,6 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); -kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); -- cgit v1.2.3 From bc07eea2f3b330127242df2e0ec2d6cd16b4f2e8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 1 Oct 2024 15:13:54 +0100 Subject: KVM: Remove unused kvm_vcpu_gfn_to_pfn_atomic The last use of kvm_vcpu_gfn_to_pfn_atomic was removed by commit 1bbc60d0c7e5 ("KVM: x86/mmu: Remove MMU auditing") Remove it. Signed-off-by: Dr. David Alan Gilbert Message-ID: <20241001141354.18009-3-linux@treblig.org> [Adjust Documentation/virt/kvm/locking.rst. - Paolo] Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b9b2e42e3fa7..45be36e5285f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1313,7 +1313,6 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); -kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); -- cgit v1.2.3 From 95ecba62e2fd201bcdcca636f5d774f1cd4f1458 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Oct 2024 19:41:18 +0000 Subject: net: fix races in netdev_tx_sent_queue()/dev_watchdog() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some workloads hit the infamous dev_watchdog() message: "NETDEV WATCHDOG: eth0 (xxxx): transmit queue XX timed out" It seems possible to hit this even for perfectly normal BQL enabled drivers: 1) Assume a TX queue was idle for more than dev->watchdog_timeo (5 seconds unless changed by the driver) 2) Assume a big packet is sent, exceeding current BQL limit. 3) Driver ndo_start_xmit() puts the packet in TX ring, and netdev_tx_sent_queue() is called. 4) QUEUE_STATE_STACK_XOFF could be set from netdev_tx_sent_queue() before txq->trans_start has been written. 5) txq->trans_start is written later, from netdev_start_xmit() if (rc == NETDEV_TX_OK) txq_trans_update(txq) dev_watchdog() running on another cpu could read the old txq->trans_start, and then see QUEUE_STATE_STACK_XOFF, because 5) did not happen yet. To solve the issue, write txq->trans_start right before one XOFF bit is set : - _QUEUE_STATE_DRV_XOFF from netif_tx_stop_queue() - __QUEUE_STATE_STACK_XOFF from netdev_tx_sent_queue() From dev_watchdog(), we have to read txq->state before txq->trans_start. Add memory barriers to enforce correct ordering. In the future, we could avoid writing over txq->trans_start for normal operations, and rename this field to txq->xoff_start_time. Fixes: bec251bc8b6a ("net: no longer stop all TX queues in dev_watchdog()") Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241015194118.3951657-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..8896705ccd63 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3325,6 +3325,12 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { + /* Paired with READ_ONCE() from dev_watchdog() */ + WRITE_ONCE(dev_queue->trans_start, jiffies); + + /* This barrier is paired with smp_mb() from dev_watchdog() */ + smp_mb__before_atomic(); + /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } @@ -3451,6 +3457,12 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, if (likely(dql_avail(&dev_queue->dql) >= 0)) return; + /* Paired with READ_ONCE() from dev_watchdog() */ + WRITE_ONCE(dev_queue->trans_start, jiffies); + + /* This barrier is paired with smp_mb() from dev_watchdog() */ + smp_mb__before_atomic(); + set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); /* -- cgit v1.2.3 From 4021e685139d567b3fc862f54101ae9dbb15d8b5 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 9 Oct 2024 11:31:50 +0800 Subject: fs/super.c: introduce get_tree_bdev_flags() As Allison reported [1], currently get_tree_bdev() will store "Can't lookup blockdev" error message. Although it makes sense for pure bdev-based fses, this message may mislead users who try to use EROFS file-backed mounts since get_tree_nodev() is used as a fallback then. Add get_tree_bdev_flags() to specify extensible flags [2] and GET_TREE_BDEV_QUIET_LOOKUP to silence "Can't lookup blockdev" message since it's misleading to EROFS file-backed mounts now. [1] https://lore.kernel.org/r/CAOYeF9VQ8jKVmpy5Zy9DNhO6xmWSKMB-DO8yvBB0XvBE7=3Ugg@mail.gmail.com [2] https://lore.kernel.org/r/ZwUkJEtwIpUA4qMz@infradead.org Suggested-by: Christoph Hellwig Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241009033151.2334888-1-hsiangkao@linux.alibaba.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs_context.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index c13e99cbbf81..4b4bfef6f053 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -160,6 +160,12 @@ extern int get_tree_keyed(struct fs_context *fc, int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc); + +#define GET_TREE_BDEV_QUIET_LOOKUP 0x0001 +int get_tree_bdev_flags(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc), unsigned int flags); + extern int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)); -- cgit v1.2.3 From d2f8671045b41871053dedaf3035a06ad53d2736 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 21 Oct 2024 22:11:19 +0800 Subject: LoongArch: Set initial pte entry with PAGE_GLOBAL for kernel space There are two pages in one TLB entry on LoongArch system. For kernel space, it requires both two pte entries (buddies) with PAGE_GLOBAL bit set, otherwise HW treats it as non-global tlb, there will be potential problems if tlb entry for kernel space is not global. Such as fail to flush kernel tlb with the function local_flush_tlb_kernel_range() which supposed only flush tlb with global bit. Kernel address space areas include percpu, vmalloc, vmemmap, fixmap and kasan areas. For these areas both two consecutive page table entries should be enabled with PAGE_GLOBAL bit. So with function set_pte() and pte_clear(), pte buddy entry is checked and set besides its own pte entry. However it is not atomic operation to set both two pte entries, there is problem with test_vmalloc test case. So function kernel_pte_init() is added to init a pte table when it is created for kernel address space, and the default initial pte value is PAGE_GLOBAL rather than zero at beginning. Then only its own pte entry need update with function set_pte() and pte_clear(), nothing to do with the pte buddy entry. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..61fff5d34ed5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3818,8 +3818,9 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -void pmd_init(void *addr); void pud_init(void *addr); +void pmd_init(void *addr); +void kernel_pte_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); -- cgit v1.2.3 From 6db388585e486c0261aeef55f8bc63a9b45756c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Oct 2024 06:13:50 +0200 Subject: iomap: turn iomap_want_unshare_iter into an inline function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit iomap_want_unshare_iter currently sits in fs/iomap/buffered-io.c, which depends on CONFIG_BLOCK. It is also in used in fs/dax.c whіch has no such dependency. Given that it is a trivial check turn it into an inline in include/linux/iomap.h to fix the DAX && !BLOCK build. Fixes: 6ef6a0e821d3 ("iomap: share iomap_unshare_iter predicate code with fsdax") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241015041350.118403-1-hch@lst.de Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- include/linux/iomap.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d8a7fc84348c..0198f36e521e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -256,6 +256,25 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) return &i->iomap; } +/* + * Check if the range needs to be unshared for a FALLOC_FL_UNSHARE_RANGE + * operation. + * + * Don't bother with blocks that are not shared to start with; or mappings that + * cannot be shared, such as inline data, delalloc reservations, holes or + * unwritten extents. + * + * Note that we use srcmap directly instead of iomap_iter_srcmap as unsharing + * requires providing a separate source map, and the presence of one is a good + * indicator that unsharing is needed, unlike IOMAP_F_SHARED which can be set + * for any data that goes into the COW fork for XFS. + */ +static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) +{ + return (iter->iomap.flags & IOMAP_F_SHARED) && + iter->srcmap.type == IOMAP_MAPPED; +} + ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, void *private); int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); @@ -267,7 +286,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); -bool iomap_want_unshare_iter(const struct iomap_iter *iter); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, -- cgit v1.2.3 From 6fad274f06f038c29660aa53fbad14241c9fd976 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Oct 2024 17:28:05 +0200 Subject: bpf: Add MEM_WRITE attribute Add a MEM_WRITE attribute for BPF helper functions which can be used in bpf_func_proto to annotate an argument type in order to let the verifier know that the helper writes into the memory passed as an argument. In the past MEM_UNINIT has been (ab)used for this function, but the latter merely tells the verifier that the passed memory can be uninitialized. There have been bugs with overloading the latter but aside from that there are also cases where the passed memory is read + written which currently cannot be expressed, see also 4b3786a6c539 ("bpf: Zero former ARG_PTR_TO_{LONG,INT} args in case of error"). Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241021152809.33343-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19d8ca8ac960..bdadb0bb6cec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -635,6 +635,7 @@ enum bpf_type_flag { */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), + /* MEM can be uninitialized. */ MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), /* DYNPTR points to memory local to the bpf program. */ @@ -700,6 +701,13 @@ enum bpf_type_flag { */ MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS), + /* MEM is being written to, often combined with MEM_UNINIT. Non-presence + * of MEM_WRITE means that MEM is only being read. MEM_WRITE without the + * MEM_UNINIT means that memory needs to be initialized since it is also + * read. + */ + MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -758,10 +766,10 @@ enum bpf_arg_type { ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, - /* pointer to memory does not need to be initialized, helper function must fill - * all bytes or clear them in error case. + /* Pointer to memory does not need to be initialized, since helper function + * fills all bytes or clears them in error case. */ - ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, + ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | MEM_WRITE | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */ ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, -- cgit v1.2.3 From 894b00a3350c560990638bdf89bdf1f3d5491950 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 21 Oct 2024 14:00:10 +0200 Subject: kasan: Fix Software Tag-Based KASAN with GCC Per [1], -fsanitize=kernel-hwaddress with GCC currently does not disable instrumentation in functions with __attribute__((no_sanitize_address)). However, __attribute__((no_sanitize("hwaddress"))) does correctly disable instrumentation. Use it instead. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117196 [1] Link: https://lore.kernel.org/r/000000000000f362e80620e27859@google.com Link: https://lore.kernel.org/r/ZvFGwKfoC4yVjN_X@J2N7QTR9R3 Link: https://bugzilla.kernel.org/show_bug.cgi?id=218854 Reported-by: syzbot+908886656a02769af987@syzkaller.appspotmail.com Tested-by: Andrey Konovalov Cc: Andrew Pinski Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Fixes: 7b861a53e46b ("kasan: Bump required compiler version") Link: https://lore.kernel.org/r/20241021120013.3209481-1-elver@google.com Signed-off-by: Will Deacon --- include/linux/compiler-gcc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index f805adaa316e..cd6f9aae311f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -80,7 +80,11 @@ #define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif +#ifdef __SANITIZE_HWADDRESS__ +#define __no_sanitize_address __attribute__((__no_sanitize__("hwaddress"))) +#else #define __no_sanitize_address __attribute__((__no_sanitize_address__)) +#endif #if defined(__SANITIZE_THREAD__) #define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) -- cgit v1.2.3 From c2f803052bc7a7feb2e03befccc8e49b6ff1f5f5 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Oct 2024 09:35:57 +0800 Subject: bpf: Add the missing BPF_LINK_TYPE invocation for sockmap There is an out-of-bounds read in bpf_link_show_fdinfo() for the sockmap link fd. Fix it by adding the missing BPF_LINK_TYPE invocation for sockmap link Also add comments for bpf_link_type to prevent missing updates in the future. Fixes: 699c23f02c65 ("bpf: Add bpf_link support for sk_msg and sk_skb progs") Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241024013558.1135167-2-houtao@huaweicloud.com --- include/linux/bpf_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9f2a6b83b49e..fa78f49d4a9a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -146,6 +146,7 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) BPF_LINK_TYPE(BPF_LINK_TYPE_NETFILTER, netfilter) BPF_LINK_TYPE(BPF_LINK_TYPE_TCX, tcx) BPF_LINK_TYPE(BPF_LINK_TYPE_NETKIT, netkit) +BPF_LINK_TYPE(BPF_LINK_TYPE_SOCKMAP, sockmap) #endif #ifdef CONFIG_PERF_EVENTS BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) -- cgit v1.2.3 From 101c268bd2f37e965a5468353e62d154db38838e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 22 Oct 2024 18:43:49 -0700 Subject: cxl/port: Fix use-after-free, permit out-of-order decoder shutdown In support of investigating an initialization failure report [1], cxl_test was updated to register mock memory-devices after the mock root-port/bus device had been registered. That led to cxl_test crashing with a use-after-free bug with the following signature: cxl_port_attach_region: cxl region3: cxl_host_bridge.0:port3 decoder3.0 add: mem0:decoder7.0 @ 0 next: cxl_switch_uport.0 nr_eps: 1 nr_targets: 1 cxl_port_attach_region: cxl region3: cxl_host_bridge.0:port3 decoder3.0 add: mem4:decoder14.0 @ 1 next: cxl_switch_uport.0 nr_eps: 2 nr_targets: 1 cxl_port_setup_targets: cxl region3: cxl_switch_uport.0:port6 target[0] = cxl_switch_dport.0 for mem0:decoder7.0 @ 0 1) cxl_port_setup_targets: cxl region3: cxl_switch_uport.0:port6 target[1] = cxl_switch_dport.4 for mem4:decoder14.0 @ 1 [..] cxld_unregister: cxl decoder14.0: cxl_region_decode_reset: cxl_region region3: mock_decoder_reset: cxl_port port3: decoder3.0 reset 2) mock_decoder_reset: cxl_port port3: decoder3.0: out of order reset, expected decoder3.1 cxl_endpoint_decoder_release: cxl decoder14.0: [..] cxld_unregister: cxl decoder7.0: 3) cxl_region_decode_reset: cxl_region region3: Oops: general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6bc3: 0000 [#1] PREEMPT SMP PTI [..] RIP: 0010:to_cxl_port+0x8/0x60 [cxl_core] [..] Call Trace: cxl_region_decode_reset+0x69/0x190 [cxl_core] cxl_region_detach+0xe8/0x210 [cxl_core] cxl_decoder_kill_region+0x27/0x40 [cxl_core] cxld_unregister+0x5d/0x60 [cxl_core] At 1) a region has been established with 2 endpoint decoders (7.0 and 14.0). Those endpoints share a common switch-decoder in the topology (3.0). At teardown, 2), decoder14.0 is the first to be removed and hits the "out of order reset case" in the switch decoder. The effect though is that region3 cleanup is aborted leaving it in-tact and referencing decoder14.0. At 3) the second attempt to teardown region3 trips over the stale decoder14.0 object which has long since been deleted. The fix here is to recognize that the CXL specification places no mandate on in-order shutdown of switch-decoders, the driver enforces in-order allocation, and hardware enforces in-order commit. So, rather than fail and leave objects dangling, always remove them. In support of making cxl_region_decode_reset() always succeed, cxl_region_invalidate_memregion() failures are turned into warnings. Crashing the kernel is ok there since system integrity is at risk if caches cannot be managed around physical address mutation events like CXL region destruction. A new device_for_each_child_reverse_from() is added to cleanup port->commit_end after all dependent decoders have been disabled. In other words if decoders are allocated 0->1->2 and disabled 1->2->0 then port->commit_end only decrements from 2 after 2 has been disabled, and it decrements all the way to zero since 1 was disabled previously. Link: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net [1] Cc: stable@vger.kernel.org Fixes: 176baefb2eb5 ("cxl/hdm: Commit decoder state to hardware") Reviewed-by: Jonathan Cameron Cc: Greg Kroah-Hartman Cc: Davidlohr Bueso Cc: Dave Jiang Cc: Alison Schofield Cc: Ira Weiny Cc: Zijun Hu Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Link: https://patch.msgid.link/172964782781.81806.17902885593105284330.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Ira Weiny --- include/linux/device.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index b4bde8d22697..667cb6db9019 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1078,6 +1078,9 @@ int device_for_each_child(struct device *dev, void *data, int (*fn)(struct device *dev, void *data)); int device_for_each_child_reverse(struct device *dev, void *data, int (*fn)(struct device *dev, void *data)); +int device_for_each_child_reverse_from(struct device *parent, + struct device *from, const void *data, + int (*fn)(struct device *, const void *)); struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); struct device *device_find_child_by_name(struct device *parent, -- cgit v1.2.3 From b5413156bad91dc2995a5c4eab1b05e56914638a Mon Sep 17 00:00:00 2001 From: Benjamin Segall Date: Fri, 25 Oct 2024 18:35:35 -0700 Subject: posix-cpu-timers: Clear TICK_DEP_BIT_POSIX_TIMER on clone When cloning a new thread, its posix_cputimers are not inherited, and are cleared by posix_cputimers_init(). However, this does not clear the tick dependency it creates in tsk->tick_dep_mask, and the handler does not reach the code to clear the dependency if there were no timers to begin with. Thus if a thread has a cputimer running before clone/fork, all descendants will prevent nohz_full unless they create a cputimer of their own. Fix this by entirely clearing the tick_dep_mask in copy_process(). (There is currently no inherited state that needs a tick dependency) Process-wide timers do not have this problem because fork does not copy signal_struct as a baseline, it creates one from scratch. Fixes: b78783000d5c ("posix-cpu-timers: Migrate to use new tick dependency mask model") Signed-off-by: Ben Segall Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/xm26o737bq8o.fsf@google.com --- include/linux/tick.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 72744638c5b0..99c9c5a7252a 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -251,12 +251,19 @@ static inline void tick_dep_set_task(struct task_struct *tsk, if (tick_nohz_full_enabled()) tick_nohz_dep_set_task(tsk, bit); } + static inline void tick_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (tick_nohz_full_enabled()) tick_nohz_dep_clear_task(tsk, bit); } + +static inline void tick_dep_init_task(struct task_struct *tsk) +{ + atomic_set(&tsk->tick_dep_mask, 0); +} + static inline void tick_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { @@ -290,6 +297,7 @@ static inline void tick_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { } static inline void tick_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { } +static inline void tick_dep_init_task(struct task_struct *tsk) { } static inline void tick_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { } static inline void tick_dep_clear_signal(struct signal_struct *signal, -- cgit v1.2.3 From f64e67e5d3a45a4a04286c47afade4b518acd47b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 15 Oct 2024 18:56:05 +0100 Subject: fork: do not invoke uffd on fork if error occurs Patch series "fork: do not expose incomplete mm on fork". During fork we may place the virtual memory address space into an inconsistent state before the fork operation is complete. In addition, we may encounter an error during the fork operation that indicates that the virtual memory address space is invalidated. As a result, we should not be exposing it in any way to external machinery that might interact with the mm or VMAs, machinery that is not designed to deal with incomplete state. We specifically update the fork logic to defer khugepaged and ksm to the end of the operation and only to be invoked if no error arose, and disallow uffd from observing fork events should an error have occurred. This patch (of 2): Currently on fork we expose the virtual address space of a process to userland unconditionally if uffd is registered in VMAs, regardless of whether an error arose in the fork. This is performed in dup_userfaultfd_complete() which is invoked unconditionally, and performs two duties - invoking registered handlers for the UFFD_EVENT_FORK event via dup_fctx(), and clearing down userfaultfd_fork_ctx objects established in dup_userfaultfd(). This is problematic, because the virtual address space may not yet be correctly initialised if an error arose. The change in commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") makes this more pertinent as we may be in a state where entries in the maple tree are not yet consistent. We address this by, on fork error, ensuring that we roll back state that we would otherwise expect to clean up through the event being handled by userland and perform the memory freeing duty otherwise performed by dup_userfaultfd_complete(). We do this by implementing a new function, dup_userfaultfd_fail(), which performs the same loop, only decrementing reference counts. Note that we perform mmgrab() on the parent and child mm's, however userfaultfd_ctx_put() will mmdrop() this once the reference count drops to zero, so we will avoid memory leaks correctly here. Link: https://lkml.kernel.org/r/cover.1729014377.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/d3691d58bb58712b6fb3df2be441d175bd3cdf07.1729014377.git.lorenzo.stoakes@oracle.com Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Jann Horn Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Linus Torvalds Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9fc6ce15c499..cb40f1a1d081 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); +void dup_userfaultfd_fail(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); @@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l) { } +static inline void dup_userfaultfd_fail(struct list_head *l) +{ +} + static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *ctx) { -- cgit v1.2.3 From 985da552a98e27096444508ce5d853244019111f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 15 Oct 2024 18:56:06 +0100 Subject: fork: only invoke khugepaged, ksm hooks if no error There is no reason to invoke these hooks early against an mm that is in an incomplete state. The change in commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") makes this more pertinent as we may be in a state where entries in the maple tree are not yet consistent. Their placement early in dup_mmap() only appears to have been meaningful for early error checking, and since functionally it'd require a very small allocation to fail (in practice 'too small to fail') that'd only occur in the most dire circumstances, meaning the fork would fail or be OOM'd in any case. Since both khugepaged and KSM tracking are there to provide optimisations to memory performance rather than critical functionality, it doesn't really matter all that much if, under such dire memory pressure, we fail to register an mm with these. As a result, we follow the example of commit d2081b2bf819 ("mm: khugepaged: make khugepaged_enter() void function") and make ksm_fork() a void function also. We only expose the mm to these functions once we are done with them and only if no error occurred in the fork operation. Link: https://lkml.kernel.org/r/e0cb8b840c9d1d5a6e84d4f8eff5f3f2022aa10c.1729014377.git.lorenzo.stoakes@oracle.com Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Jann Horn Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Linus Torvalds Cc: Signed-off-by: Andrew Morton --- include/linux/ksm.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 11690dacd986..ec9c05044d4f 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) return atomic_long_read(&mm->ksm_zero_pages); } -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { + /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); - - return 0; + __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) @@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm) return 0; } -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - return 0; } static inline int ksm_execve(struct mm_struct *mm) -- cgit v1.2.3 From 62a898b07b83f6f407003d8a70f0827a5af08a59 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 30 Oct 2024 18:05:13 +0800 Subject: bpf: Add bpf_mem_alloc_check_size() helper Introduce bpf_mem_alloc_check_size() to check whether the allocation size exceeds the limitation for the kmalloc-equivalent allocator. The upper limit for percpu allocation is LLIST_NODE_SZ bytes larger than non-percpu allocation, so a percpu argument is added to the helper. The helper will be used in the following patch to check whether the size parameter passed to bpf_mem_alloc() is too big. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241030100516.3633640-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index aaf004d94322..e45162ef59bb 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -33,6 +33,9 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); +/* Check the allocation size for kmalloc equivalent allocator */ +int bpf_mem_alloc_check_size(bool percpu, size_t size); + /* kmalloc/kfree equivalent: */ void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); -- cgit v1.2.3 From 0fc810ae3ae110f9e2fcccce80fc8c8d62f97907 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Oct 2024 16:03:31 -1000 Subject: x86/uaccess: Avoid barrier_nospec() in 64-bit copy_from_user() The barrier_nospec() in 64-bit copy_from_user() is slow. Instead use pointer masking to force the user pointer to all 1's for an invalid address. The kernel test robot reports a 2.6% improvement in the per_thread_ops benchmark [1]. This is a variation on a patch originally by Josh Poimboeuf [2]. Link: https://lore.kernel.org/202410281344.d02c72a2-oliver.sang@intel.com [1] Link: https://lore.kernel.org/5b887fe4c580214900e21f6c61095adf9a142735.1730166635.git.jpoimboe@kernel.org [2] Tested-and-reviewed-by: Josh Poimboeuf Cc: Kirill A. Shutemov Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 39c7cf82b0c2..43844510d5d0 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -38,6 +38,7 @@ #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL + #define mask_user_address(src) (src) #endif /* @@ -159,19 +160,27 @@ _inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); - if (!should_fail_usercopy() && likely(access_ok(from, n))) { + if (should_fail_usercopy()) + goto fail; + if (can_do_masked_user_access()) + from = mask_user_address(from); + else { + if (!access_ok(from, n)) + goto fail; /* * Ensure that bad access_ok() speculation will not * lead to nasty side effects *after* the copy is * finished: */ barrier_nospec(); - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user(to, from, n); - instrument_copy_from_user_after(to, from, n, res); } - if (unlikely(res)) - memset(to + (n - res), 0, res); + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + if (likely(!res)) + return 0; +fail: + memset(to + (n - res), 0, res); return res; } extern __must_check unsigned long -- cgit v1.2.3 From f4657e16e767105194f97586fe3c03d3f64c4d37 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Sun, 20 Oct 2024 15:08:19 +0800 Subject: mm/codetag: fix null pointer check logic for ref and tag When we compile and load lib/slub_kunit.c,it will cause a panic. The root cause is that __kmalloc_cache_noprof was directly called instead of kmem_cache_alloc,which resulted in no alloc_tag being allocated.This caused current->alloc_tag to be null,leading to a null pointer dereference in alloc_tag_ref_set. Despite the fact that my colleague Pei Xiao will later fix the code in slub_kunit.c,we still need fix null pointer check logic for ref and tag to avoid panic caused by a null pointer dereference. Here is the log for the panic: [ 74.779373][ T2158] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020 [ 74.780130][ T2158] Mem abort info: [ 74.780406][ T2158] ESR = 0x0000000096000004 [ 74.780756][ T2158] EC = 0x25: DABT (current EL), IL = 32 bits [ 74.781225][ T2158] SET = 0, FnV = 0 [ 74.781529][ T2158] EA = 0, S1PTW = 0 [ 74.781836][ T2158] FSC = 0x04: level 0 translation fault [ 74.782288][ T2158] Data abort info: [ 74.782577][ T2158] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 74.783068][ T2158] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 74.783533][ T2158] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 74.784010][ T2158] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000105f34000 [ 74.784586][ T2158] [0000000000000020] pgd=0000000000000000, p4d=0000000000000000 [ 74.785293][ T2158] Internal error: Oops: 0000000096000004 [#1] SMP [ 74.785805][ T2158] Modules linked in: slub_kunit kunit ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_mangle 4 [ 74.790661][ T2158] CPU: 0 UID: 0 PID: 2158 Comm: kunit_try_catch Kdump: loaded Tainted: G W N 6.12.0-rc3+ #2 [ 74.791535][ T2158] Tainted: [W]=WARN, [N]=TEST [ 74.791889][ T2158] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 74.792479][ T2158] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 74.793101][ T2158] pc : alloc_tagging_slab_alloc_hook+0x120/0x270 [ 74.793607][ T2158] lr : alloc_tagging_slab_alloc_hook+0x120/0x270 [ 74.794095][ T2158] sp : ffff800084d33cd0 [ 74.794418][ T2158] x29: ffff800084d33cd0 x28: 0000000000000000 x27: 0000000000000000 [ 74.795095][ T2158] x26: 0000000000000000 x25: 0000000000000012 x24: ffff80007b30e314 [ 74.795822][ T2158] x23: ffff000390ff6f10 x22: 0000000000000000 x21: 0000000000000088 [ 74.796555][ T2158] x20: ffff000390285840 x19: fffffd7fc3ef7830 x18: ffffffffffffffff [ 74.797283][ T2158] x17: ffff8000800e63b4 x16: ffff80007b33afc4 x15: ffff800081654c00 [ 74.798011][ T2158] x14: 0000000000000000 x13: 205d383531325420 x12: 5b5d383734363537 [ 74.798744][ T2158] x11: ffff800084d337e0 x10: 000000000000005d x9 : 00000000ffffffd0 [ 74.799476][ T2158] x8 : 7f7f7f7f7f7f7f7f x7 : ffff80008219d188 x6 : c0000000ffff7fff [ 74.800206][ T2158] x5 : ffff0003fdbc9208 x4 : ffff800081edd188 x3 : 0000000000000001 [ 74.800932][ T2158] x2 : 0beaa6dee1ac5a00 x1 : 0beaa6dee1ac5a00 x0 : ffff80037c2cb000 [ 74.801656][ T2158] Call trace: [ 74.801954][ T2158] alloc_tagging_slab_alloc_hook+0x120/0x270 [ 74.802494][ T2158] __kmalloc_cache_noprof+0x148/0x33c [ 74.802976][ T2158] test_kmalloc_redzone_access+0x4c/0x104 [slub_kunit] [ 74.803607][ T2158] kunit_try_run_case+0x70/0x17c [kunit] [ 74.804124][ T2158] kunit_generic_run_threadfn_adapter+0x2c/0x4c [kunit] [ 74.804768][ T2158] kthread+0x10c/0x118 [ 74.805141][ T2158] ret_from_fork+0x10/0x20 [ 74.805540][ T2158] Code: b9400a80 11000400 b9000a80 97ffd858 (f94012d3) [ 74.806176][ T2158] SMP: stopping secondary CPUs [ 74.808130][ T2158] Starting crashdump kernel... Link: https://lkml.kernel.org/r/20241020070819.307944-1-hao.ge@linux.dev Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()") Signed-off-by: Hao Ge Acked-by: Suren Baghdasaryan Suggested-by: Suren Baghdasaryan Acked-by: Yu Zhao Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 1f0a9ff23a2c..941deffc590d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,18 +135,21 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {} #endif /* Caller should verify both ref and tag to be valid */ -static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +static inline bool __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { alloc_tag_add_check(ref, tag); if (!ref || !tag) - return; + return false; ref->ct = &tag->ct; + return true; } -static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { - __alloc_tag_ref_set(ref, tag); + if (unlikely(!__alloc_tag_ref_set(ref, tag))) + return false; + /* * We need in increment the call counter every time we have a new * allocation or when we split a large allocation into smaller ones. @@ -154,12 +157,13 @@ static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *t * counter because when we free each part the counter will be decremented. */ this_cpu_inc(tag->counters->calls); + return true; } static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) { - alloc_tag_ref_set(ref, tag); - this_cpu_add(tag->counters->bytes, bytes); + if (likely(alloc_tag_ref_set(ref, tag))) + this_cpu_add(tag->counters->bytes, bytes); } static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) -- cgit v1.2.3 From 5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 23 Oct 2024 01:55:12 +0800 Subject: mm, swap: avoid over reclaim of full clusters When running low on usable slots, cluster allocator will try to reclaim the full clusters aggressively to reclaim HAS_CACHE slots. This guarantees that as long as there are any usable slots, HAS_CACHE or not, the swap device will be usable and workload won't go OOM early. Before the cluster allocator, swap allocator fails easily if device is filled up with reclaimable HAS_CACHE slots. Which can be easily reproduced with following simple program: #include #include #include #include #define SIZE 8192UL * 1024UL * 1024UL int main(int argc, char **argv) { long tmp; char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); memset(p, 0, SIZE); madvise(p, SIZE, MADV_PAGEOUT); for (unsigned long i = 0; i < SIZE; ++i) tmp += p[i]; getchar(); /* Pause */ return 0; } Setup an 8G non ramdisk swap, the first run of the program will swapout 8G ram successfully. But run same program again after the first run paused, the second run can't swapout all 8G memory as now half of the swap device is pinned by HAS_CACHE. There was a random scan in the old allocator that may reclaim part of the HAS_CACHE by luck, but it's unreliable. The new allocator's added reclaim of full clusters when device is low on usable slots. But when multiple CPUs are seeing the device is low on usable slots at the same time, they ran into a thundering herd problem. This is an observable problem on large machine with mass parallel workload, as full cluster reclaim is slower on large swap device and higher number of CPUs will also make things worse. Testing using a 128G ZRAM on a 48c96t system. When the swap device is very close to full (eg. 124G / 128G), running build linux kernel with make -j96 in a 1G memory cgroup will hung (not a softlockup though) spinning in full cluster reclaim for about ~5min before go OOM. To solve this, split the full reclaim into two parts: - Instead of do a synchronous aggressively reclaim when device is low, do only one aggressively reclaim when device is strictly full with a kworker. This still ensures in worst case the device won't be unusable because of HAS_CACHE slots. - To avoid allocation (especially higher order) suffer from HAS_CACHE filling up clusters and kworker not responsive enough, do one synchronous scan every time the free list is drained, and only scan one cluster. This is kind of similar to the random reclaim before, keeps the full clusters rotated and has a minimal latency. This should provide a fair reclaim strategy suitable for most workloads. Link: https://lkml.kernel.org/r/20241022175512.10398-1-ryncsn@gmail.com Fixes: 2cacbdfdee65 ("mm: swap: add a adaptive full cluster cache reclaim") Signed-off-by: Kairui Song Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kalesh Singh Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index ca533b478c21..f3e0ac20c2e8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,6 +335,7 @@ struct swap_info_struct { * list. */ struct work_struct discard_work; /* discard worker */ + struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one -- cgit v1.2.3 From 9d08ec41a0645283d79a2e642205d488feaceacf Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 22:22:12 -0600 Subject: mm: allow set/clear page_type again Some page flags (page->flags) were converted to page types (page->page_types). A recent example is PG_hugetlb. From the exclusive writer's perspective, e.g., a thread doing __folio_set_hugetlb(), there is a difference between the page flag and type APIs: the former allows the same non-atomic operation to be repeated whereas the latter does not. For example, calling __folio_set_hugetlb() twice triggers VM_BUG_ON_FOLIO(), since the second call expects the type (PG_hugetlb) not to be set previously. Using add_hugetlb_folio() as an example, it calls __folio_set_hugetlb() in the following error-handling path. And when that happens, it triggers the aforementioned VM_BUG_ON_FOLIO(). if (folio_test_hugetlb(folio)) { rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, false); ... It is possible to make hugeTLB comply with the new requirements from the page type API. However, a straightforward fix would be to just allow the same page type to be set or cleared again inside the API, to avoid any changes to its callers. Link: https://lkml.kernel.org/r/20241020042212.296781-1-yuzhao@google.com Fixes: d99e3140a4d3 ("mm: turn folio_test_hugetlb into a PageType") Signed-off-by: Yu Zhao Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1b3a76710487..cc839e4365c1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ + if (folio_test_##fname(folio)) \ + return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ + if (folio->page.page_type == UINT_MAX) \ + return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } @@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ + if (Page##uname(page)) \ + return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ + if (page->page_type == UINT_MAX) \ + return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } -- cgit v1.2.3 From 071b24b54d2d05fbf39ddbb27dee08abd1d713f3 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 27 Oct 2024 22:31:15 -0700 Subject: Input: fix regression when re-registering input handlers Commit d469647bafd9 ("Input: simplify event handling logic") introduced code that would set handler->events() method to either input_handler_events_filter() or input_handler_events_default() or input_handler_events_null(), depending on the kind of input handler (a filter or a regular one) we are dealing with. Unfortunately this breaks cases when we try to re-register the same filter (as is the case with sysrq handler): after initial registration the handler will have 2 event handling methods defined, and will run afoul of the check in input_handler_check_methods(): input: input_handler_check_methods: only one event processing method can be defined (sysrq) sysrq: Failed to register input handler, error -22 Fix this by adding handle_events() method to input_handle structure and setting it up when registering a new input handle according to event handling methods defined in associated input_handler structure, thus avoiding modifying the input_handler structure. Reported-by: "Ned T. Crigler" Reported-by: Christian Heusel Tested-by: "Ned T. Crigler" Tested-by: Peter Seiderer Fixes: d469647bafd9 ("Input: simplify event handling logic") Link: https://lore.kernel.org/r/Zx2iQp6csn42PJA7@xavtug Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 89a0be6ee0e2..cd866b020a01 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -339,12 +339,16 @@ struct input_handler { * @name: name given to the handle by handler that created it * @dev: input device the handle is attached to * @handler: handler that works with the device through this handle + * @handle_events: event sequence handler. It is set up by the input core + * according to event handling method specified in the @handler. See + * input_handle_setup_event_handler(). + * This method is being called by the input core with interrupts disabled + * and dev->event_lock spinlock held and so it may not sleep. * @d_node: used to put the handle on device's list of attached handles * @h_node: used to put the handle on handler's list of handles from which * it gets events */ struct input_handle { - void *private; int open; @@ -353,6 +357,10 @@ struct input_handle { struct input_dev *dev; struct input_handler *handler; + unsigned int (*handle_events)(struct input_handle *handle, + struct input_value *vals, + unsigned int count); + struct list_head d_node; struct list_head h_node; }; -- cgit v1.2.3 From ddd6d8e975b171ea3f63a011a75820883ff0d479 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 01:29:38 +0000 Subject: mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL stats Patch series "mm: multi-gen LRU: Have secondary MMUs participate in MM_WALK". Today, the MM_WALK capability causes MGLRU to clear the young bit from PMDs and PTEs during the page table walk before eviction, but MGLRU does not call the clear_young() MMU notifier in this case. By not calling this notifier, the MM walk takes less time/CPU, but it causes pages that are accessed mostly through KVM / secondary MMUs to appear younger than they should be. We do call the clear_young() notifier today, but only when attempting to evict the page, so we end up clearing young/accessed information less frequently for secondary MMUs than for mm PTEs, and therefore they appear younger and are less likely to be evicted. Therefore, memory that is *not* being accessed mostly by KVM will be evicted *more* frequently, worsening performance. ChromeOS observed a tab-open latency regression when enabling MGLRU with a setup that involved running a VM: Tab-open latency histogram (ms) Version p50 mean p95 p99 max base 1315 1198 2347 3454 10319 mglru 2559 1311 7399 12060 43758 fix 1119 926 2470 4211 6947 This series replaces the final non-selftest patchs from this series[1], which introduced a similar change (and a new MMU notifier) with KVM optimizations. I'll send a separate series (to Sean and Paolo) for the KVM optimizations. This series also makes proactive reclaim with MGLRU possible for KVM memory. I have verified that this functions correctly with the selftest from [1], but given that that test is a KVM selftest, I'll send it with the rest of the KVM optimizations later. Andrew, let me know if you'd like to take the test now anyway. [1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google.com/ This patch (of 2): The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful and become more complicated to properly compute when adding test/clear_young() notifiers in MGLRU's mm walk. Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao Signed-off-by: James Houghton Cc: Axel Rasmussen Cc: David Matlack Cc: David Rientjes Cc: David Stevens Cc: Oliver Upton Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Wei Xu Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 17506e4a2835..9342e5692dab 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,9 +458,7 @@ struct lru_gen_folio { enum { MM_LEAF_TOTAL, /* total leaf entries */ - MM_LEAF_OLD, /* old leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ - MM_NONLEAF_TOTAL, /* total non-leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS -- cgit v1.2.3 From 1d4832becdc2cdb2cffe2a6050c9d9fd8ff1c58c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 01:29:39 +0000 Subject: mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify() When the MM_WALK capability is enabled, memory that is mostly accessed by a VM appears younger than it really is, therefore this memory will be less likely to be evicted. Therefore, the presence of a running VM can significantly increase swap-outs for non-VM memory, regressing the performance for the rest of the system. Fix this regression by always calling {ptep,pmdp}_clear_young_notify() whenever we clear the young bits on PMDs/PTEs. [jthoughton@google.com: fix link-time error] Link: https://lkml.kernel.org/r/20241019012940.3656292-3-jthoughton@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao Signed-off-by: James Houghton Reported-by: David Stevens Cc: Axel Rasmussen Cc: David Matlack Cc: David Rientjes Cc: Oliver Upton Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Wei Xu Cc: Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9342e5692dab..5b1c984daf45 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -555,7 +555,7 @@ struct lru_gen_memcg { void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -574,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { + return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) -- cgit v1.2.3