From b777b5e09eabeefc6ba80f4296366a4742701103 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Feb 2026 17:02:25 +0000 Subject: time/jiffies: Inline jiffies_to_msecs() and jiffies_to_usecs() For common cases (HZ=100, 250 or 1000), these helpers are at most one multiply, so there is no point calling a tiny function. Keep them out of line for HZ=300 and others. This saves cycles in TCP fast path, among other things. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/8 grow/shrink: 25/89 up/down: 530/-3474 (-2944) ... nla_put_msecs 193 - -193 message_stats_print 2131 920 -1211 Total: Before=25365208, After=25362264, chg -0.01% Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260210170226.57209-1-edumazet@google.com --- include/linux/jiffies.h | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fdef2c155c27..d1c3d4941854 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -434,8 +434,44 @@ extern unsigned long preset_lpj; /* * Convert various time units to each other: */ -extern unsigned int jiffies_to_msecs(const unsigned long j); -extern unsigned int jiffies_to_usecs(const unsigned long j); + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) +/** + * jiffies_to_msecs - Convert jiffies to milliseconds + * @j: jiffies value + * + * This inline version takes care of HZ in {100,250,1000}. + * + * Return: milliseconds value + */ +static inline unsigned int jiffies_to_msecs(const unsigned long j) +{ + return (MSEC_PER_SEC / HZ) * j; +} +#else +unsigned int jiffies_to_msecs(const unsigned long j); +#endif + +#if !(USEC_PER_SEC % HZ) +/** + * jiffies_to_usecs - Convert jiffies to microseconds + * @j: jiffies value + * + * Return: microseconds value + */ +static inline unsigned int jiffies_to_usecs(const unsigned long j) +{ + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. + */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + + return (USEC_PER_SEC / HZ) * j; +} +#else +unsigned int jiffies_to_usecs(const unsigned long j); +#endif /** * jiffies_to_nsecs - Convert jiffies to nanoseconds -- cgit v1.2.3 From ce9e40a9a5e5cff0b1b0d2fa582b3d71a8ce68e8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 6 Feb 2026 15:48:16 +0000 Subject: irqchip/gic-v3-its: Limit number of per-device MSIs to the range the ITS supports The ITS driver blindly assumes that EventIDs are in abundant supply, to the point where it never checks how many the hardware actually supports. It turns out that some pretty esoteric integrations make it so that only a few bits are available, all the way down to a single bit. Enforce the advertised limitation at the point of allocating the device structure, and hope that the endpoint driver can deal with such limitation. Fixes: 84a6a2e7fc18d ("irqchip: GICv3: ITS: device allocation and configuration") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Reviewed-by: Robin Murphy Reviewed-by: Zenghui Yu Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260206154816.3582887-1-maz@kernel.org --- include/linux/irqchip/arm-gic-v3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 70c0948f978e..0225121f3013 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -394,6 +394,7 @@ #define GITS_TYPER_VLPIS (1UL << 1) #define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 #define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS GENMASK_ULL(12, 8) #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) -- cgit v1.2.3 From 249013e673fce3506c61063c7cbedd75b4c668d8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 22:09:21 -0800 Subject: fsnotify: drop unused helper Remove this helper now that all users have been converted to fserror_report_metadata as of 7.0-rc1. Cc: jack@suse.cz Cc: amir73il@gmail.com Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/177148129543.716249.980530449513340111.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 28a9cb13fbfa..079c18bcdbde 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -495,19 +495,6 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) fsnotify_dentry(dentry, mask); } -static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, - int error) -{ - struct fs_error_report report = { - .error = error, - .inode = inode, - .sb = sb, - }; - - return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, - NULL, NULL, NULL, 0); -} - static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); -- cgit v1.2.3 From 6e3c0a4e1ad1e0455b7880fad02b3ee179f56c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Apr 2025 12:16:28 +0200 Subject: sched/fair: Fix lag clamp Vincent reported that he was seeing undue lag clamping in a mixed slice workload. Implement the max_slice tracking as per the todo comment. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Reported-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20250422101628.GA33555@noisy.programming.kicks-ass.net --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 074ad4ef3d81..a7b4a980eb2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -579,6 +579,7 @@ struct sched_entity { u64 deadline; u64 min_vruntime; u64 min_slice; + u64 max_slice; struct list_head group_node; unsigned char on_rq; -- cgit v1.2.3 From 4c652a47722f69c6f2685f05b17490ea97f643a8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Feb 2026 08:41:13 +0100 Subject: rseq: Mark rseq_arm_slice_extension_timer() __always_inline objtool warns about this function being called inside of a uaccess section: kernel/entry/common.o: warning: objtool: irqentry_exit+0x1dc: call to rseq_arm_slice_extension_timer() with UACCESS enabled Interestingly, this happens with CONFIG_RSEQ_SLICE_EXTENSION disabled, so this is an empty function, as the normal implementation is already marked __always_inline. I could reproduce this multiple times with gcc-11 but not with gcc-15, so the compiler probably got better at identifying the trivial function. Mark all the empty helpers for !RSEQ_SLICE_EXTENSION as __always_inline for consistency, avoiding this warning. Fixes: 0ac3b5c3dc45 ("rseq: Implement time slice extension enforcement timer") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260206074122.709580-1-arnd@kernel.org --- include/linux/rseq_entry.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..c6831c93cd6e 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -216,10 +216,10 @@ efault: } #else /* CONFIG_RSEQ_SLICE_EXTENSION */ -static inline bool rseq_slice_extension_enabled(void) { return false; } -static inline bool rseq_arm_slice_extension_timer(void) { return false; } -static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static __always_inline bool rseq_slice_extension_enabled(void) { return false; } +static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } +static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } +static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -- cgit v1.2.3 From 3b68df978133ac3d46d570af065a73debbb68248 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 20 Feb 2026 15:06:41 -0500 Subject: rseq: slice ext: Ensure rseq feature size differs from original rseq size Before rseq became extensible, its original size was 32 bytes even though the active rseq area was only 20 bytes. This had the following impact in terms of userspace ecosystem evolution: * The GNU libc between 2.35 and 2.39 expose a __rseq_size symbol set to 32, even though the size of the active rseq area is really 20. * The GNU libc 2.40 changes this __rseq_size to 20, thus making it express the active rseq area. * Starting from glibc 2.41, __rseq_size corresponds to the AT_RSEQ_FEATURE_SIZE from getauxval(3). This means that users of __rseq_size can always expect it to correspond to the active rseq area, except for the value 32, for which the active rseq area is 20 bytes. Exposing a 32 bytes feature size would make life needlessly painful for userspace. Therefore, add a reserved field at the end of the rseq area to bump the feature size to 33 bytes. This reserved field is expected to be replaced with whatever field will come next, expecting that this field will be larger than 1 byte. The effect of this change is to increase the size from 32 to 64 bytes before we actually have fields using that memory. Clarify the allocation size and alignment requirements in the struct rseq uapi comment. Change the value returned by getauxval(AT_RSEQ_ALIGN) to return the value of the active rseq area size rounded up to next power of 2, which guarantees that the rseq structure will always be aligned on the nearest power of two large enough to contain it, even as it grows. Change the alignment check in the rseq registration accordingly. This will minimize the amount of ABI corner-cases we need to document and require userspace to play games with. The rule stays simple when __rseq_size != 32: #define rseq_field_available(field) (__rseq_size >= offsetofend(struct rseq_abi, field)) Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260220200642.1317826-3-mathieu.desnoyers@efficios.com --- include/linux/rseq.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7a01a0760405..b9d62fc2140d 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -146,6 +146,18 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) t->rseq = current->rseq; } +/* + * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq + * registration. This is the active rseq area size rounded up to next + * power of 2, which guarantees that the rseq structure will always be + * aligned on the nearest power of two large enough to contain it, even + * as it grows. + */ +static inline unsigned int rseq_alloc_align(void) +{ + return 1U << get_count_order(offsetof(struct rseq, end)); +} + #else /* CONFIG_RSEQ */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } -- cgit v1.2.3 From 901084c51a0a8fb42a3f37d2e9c62083c495f824 Mon Sep 17 00:00:00 2001 From: Penghe Geng Date: Thu, 19 Feb 2026 15:29:54 -0500 Subject: mmc: core: Avoid bitfield RMW for claim/retune flags Move claimed and retune control flags out of the bitfield word to avoid unrelated RMW side effects in asynchronous contexts. The host->claimed bit shared a word with retune flags. Writes to claimed in __mmc_claim_host() or retune_now in mmc_mq_queue_rq() can overwrite other bits when concurrent updates happen in other contexts, triggering spurious WARN_ON(!host->claimed). Convert claimed, can_retune, retune_now and retune_paused to bool to remove shared-word coupling. Fixes: 6c0cedd1ef952 ("mmc: core: Introduce host claiming by context") Fixes: 1e8e55b67030c ("mmc: block: Add CQE support") Cc: stable@vger.kernel.org Suggested-by: Adrian Hunter Signed-off-by: Penghe Geng Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e0e2c265e5d1..ba84f02c2a10 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -486,14 +486,12 @@ struct mmc_host { struct mmc_ios ios; /* current io bus settings */ + bool claimed; /* host exclusively claimed */ + /* group bitfields together to minimize padding */ unsigned int use_spi_crc:1; - unsigned int claimed:1; /* host exclusively claimed */ unsigned int doing_init_tune:1; /* initial tuning in progress */ - unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ - unsigned int retune_now:1; /* do re-tuning at next req */ - unsigned int retune_paused:1; /* re-tuning is temporarily disabled */ unsigned int retune_crc_disable:1; /* don't trigger retune upon crc */ unsigned int can_dma_map_merge:1; /* merging can be used */ unsigned int vqmmc_enabled:1; /* vqmmc regulator is enabled */ @@ -508,6 +506,9 @@ struct mmc_host { int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ + bool can_retune; /* re-tuning can be used */ + bool retune_now; /* do re-tuning at next req */ + bool retune_paused; /* re-tuning is temporarily disabled */ int need_retune; /* re-tuning is needed */ int hold_retune; /* hold off re-tuning */ unsigned int retune_period; /* re-tuning period in secs */ -- cgit v1.2.3 From 3afd8df024339c7da1a5a0302f3987866dd16e40 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Dec 2025 21:36:25 +0100 Subject: PM: runtime: Change pm_runtime_put() return type to void The primary role of pm_runtime_put() is to decrement the runtime PM usage counter of the given device. It always does that regardless of the value returned by it later. In addition, if the runtime PM usage counter after decrementation turns out to be zero, a work item is queued up to check whether or not the device can be suspended. This is not guaranteed to succeed though and even if it is successful, the device may still not be suspended going forward. There are multiple valid reasons why pm_runtime_put() may not decide to queue up the work item mentioned above, including, but not limited to, the case when user space has written "on" to the device's runtime PM "control" file in sysfs. In all of those cases, pm_runtime_put() returns a negative error code (even though the device's runtime PM usage counter has been successfully decremented by it) which is very confusing. In fact, its return value should only be used for debug purposes and care should be taken when doing it even in that case. Accordingly, to avoid the confusion mentioned above, change the return type of pm_runtime_put() to void. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Reviewed-by: Brian Norris Link: https://patch.msgid.link/14387202.RDIVbhacDa@rafael.j.wysocki --- include/linux/pm_runtime.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 41037c513f06..64921b10ac74 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -545,22 +545,10 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). - * - * Return: - * * 1: Success. Usage counter dropped to zero, but device was already suspended. - * * 0: Success. - * * -EINVAL: Runtime PM error. - * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status - * change ongoing. - * * -EBUSY: Runtime PM child_count non-zero. - * * -EPERM: Device PM QoS resume latency 0. - * * -EINPROGRESS: Suspend already in progress. - * * -ENOSYS: CONFIG_PM not enabled. */ -static inline int pm_runtime_put(struct device *dev) +static inline void pm_runtime_put(struct device *dev) { - return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); + __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** -- cgit v1.2.3 From 551d44200152cb26f75d2ef990aeb6185b7e37fd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 23 Feb 2026 09:33:08 -0800 Subject: default_gfp(): avoid using the "newfangled" __VA_OPT__ trick The default_gfp() helper that I added is not wrong, but it turns out that it causes unnecessary headaches for 'sparse' which doesn't support the use of __VA_OPT__ (introduced in C++20 and C23, and supported by gcc and clang for a long time). We do already use __VA_OPT__ in some other cases in the kernel (drm/xe and btrfs), but it has been fairly limited. Now it triggers for pretty much everything, and sparse ends up not working at all. We can use the traditional gcc ',##__VA_ARGS__' syntax instead: it may not be the "C standard" way and is slightly less natural in this context, but it is the traditional model for this and avoids the sparse problem. Reported-and-tested-by: Ricardo Ribalda Reported-and-tested-by: Richard Fitzgerald Reported-by: Ben Dooks Fixes: e19e1b480ac7 ("add default_gfp() helper macro and use it in the new *alloc_obj() helpers") Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2b30a0529d48..90536b2bc42e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -14,8 +14,8 @@ struct vm_area_struct; struct mempolicy; /* Helper macro to avoid gfp flags if they are the default one */ -#define __default_gfp(a,...) a -#define default_gfp(...) __default_gfp(__VA_ARGS__ __VA_OPT__(,) GFP_KERNEL) +#define __default_gfp(a,b,...) b +#define default_gfp(...) __default_gfp(,##__VA_ARGS__,GFP_KERNEL) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) -- cgit v1.2.3 From eb9549346f7578eda3755683ac2cfb4d94c0675f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Feb 2026 13:17:44 +0100 Subject: mm: change vma_alloc_folio_noprof() macro to inline function In a few rare configurations with extra warnings eanbled, the new drm_pagemap_migrate_populate_ram_pfn() calls vma_alloc_folio_noprof() but that does not use all the arguments, leading to a harmless warning: drivers/gpu/drm/drm_pagemap.c: In function 'drm_pagemap_migrate_populate_ram_pfn': drivers/gpu/drm/drm_pagemap.c:701:63: error: parameter 'addr' set but not used [-Werror=unused-but-set-parameter=] 701 | unsigned long addr) | ~~~~~~~~~~~~~~^~~~ Replace the macro with an inline function so the compiler can see how the argument would be used, but is still able to optimize out the assignments. Link: https://lkml.kernel.org/r/20260216121751.2378374-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Lorenzo Stoakes Acked-by: Zi Yan Reviewed-by: Suren Baghdasaryan Cc: Alexei Starovoitov Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/gfp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2b30a0529d48..f82d74a77cad 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -339,8 +339,11 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde { return folio_alloc_noprof(gfp, order); } -#define vma_alloc_folio_noprof(gfp, order, vma, addr) \ - folio_alloc_noprof(gfp, order) +static inline struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, + struct vm_area_struct *vma, unsigned long addr) +{ + return folio_alloc_noprof(gfp, order); +} #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -- cgit v1.2.3 From f85b1c6af5bc3872f994df0a5688c1162de07a62 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 14:22:19 +0100 Subject: liveupdate: luo_file: remember retrieve() status LUO keeps track of successful retrieve attempts on a LUO file. It does so to avoid multiple retrievals of the same file. Multiple retrievals cause problems because once the file is retrieved, the serialized data structures are likely freed and the file is likely in a very different state from what the code expects. The retrieve boolean in struct luo_file keeps track of this, and is passed to the finish callback so it knows what work was already done and what it has left to do. All this works well when retrieve succeeds. When it fails, luo_retrieve_file() returns the error immediately, without ever storing anywhere that a retrieve was attempted or what its error code was. This results in an errored LIVEUPDATE_SESSION_RETRIEVE_FD ioctl to userspace, but nothing prevents it from trying this again. The retry is problematic for much of the same reasons listed above. The file is likely in a very different state than what the retrieve logic normally expects, and it might even have freed some serialization data structures. Attempting to access them or free them again is going to break things. For example, if memfd managed to restore 8 of its 10 folios, but fails on the 9th, a subsequent retrieve attempt will try to call kho_restore_folio() on the first folio again, and that will fail with a warning since it is an invalid operation. Apart from the retry, finish() also breaks. Since on failure the retrieved bool in luo_file is never touched, the finish() call on session close will tell the file handler that retrieve was never attempted, and it will try to access or free the data structures that might not exist, much in the same way as the retry attempt. There is no sane way of attempting the retrieve again. Remember the error retrieve returned and directly return it on a retry. Also pass this status code to finish() so it can make the right decision on the work it needs to do. This is done by changing the bool to an integer. A value of 0 means retrieve was never attempted, a positive value means it succeeded, and a negative value means it failed and the error code is the value. Link: https://lkml.kernel.org/r/20260216132221.987987-1-pratyush@kernel.org Fixes: 7c722a7f44e0 ("liveupdate: luo_file: implement file systems callbacks") Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index fe82a6c3005f..dd11fdc76a5f 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -23,8 +23,11 @@ struct file; /** * struct liveupdate_file_op_args - Arguments for file operation callbacks. * @handler: The file handler being called. - * @retrieved: The retrieve status for the 'can_finish / finish' - * operation. + * @retrieve_status: The retrieve status for the 'can_finish / finish' + * operation. A value of 0 means the retrieve has not been + * attempted, a positive value means the retrieve was + * successful, and a negative value means the retrieve failed, + * and the value is the error code of the call. * @file: The file object. For retrieve: [OUT] The callback sets * this to the new file. For other ops: [IN] The caller sets * this to the file being operated on. @@ -40,7 +43,7 @@ struct file; */ struct liveupdate_file_op_args { struct liveupdate_file_handler *handler; - bool retrieved; + int retrieve_status; struct file *file; u64 serialized_data; void *private_data; -- cgit v1.2.3 From 4b44cbb264d0ed3f2f2bc2659db6ce45882f4670 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Feb 2026 15:24:52 -0800 Subject: overflow: Make sure size helpers are always inlined With kmalloc_obj() performing implicit size calculations, the embedded size_mul() calls, while marked inline, were not always being inlined. I noticed a couple places where allocations were making a call out for things that would otherwise be compile-time calculated. Force the compilers to always inline these calculations. Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20260224232451.work.614-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/overflow.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index eddd987a8513..a8cb6319b4fb 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -42,7 +42,7 @@ * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ -static inline bool __must_check __must_check_overflow(bool overflow) +static __always_inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } @@ -327,7 +327,7 @@ static inline bool __must_check __must_check_overflow(bool overflow) * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_mul(size_t factor1, size_t factor2) +static __always_inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; @@ -346,7 +346,7 @@ static inline size_t __must_check size_mul(size_t factor1, size_t factor2) * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_add(size_t addend1, size_t addend2) +static __always_inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; @@ -367,7 +367,7 @@ static inline size_t __must_check size_add(size_t addend1, size_t addend2) * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). * The lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) +static __always_inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; -- cgit v1.2.3 From f3ec502b6755a3bfb12c1c47025ef989ff9efc72 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 25 Feb 2026 08:34:07 -0800 Subject: mm/slab: mark alloc tags empty for sheaves allocated with __GFP_NO_OBJ_EXT alloc_empty_sheaf() allocates sheaves from SLAB_KMALLOC caches using __GFP_NO_OBJ_EXT to avoid recursion, however it does not mark their allocation tags empty before freeing, which results in a warning when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set. Fix this by marking allocation tags for such sheaves as empty. The problem was technically introduced in commit 4c0a17e28340 but only becomes possible to hit with commit 913ffd3a1bf5. Fixes: 4c0a17e28340 ("slab: prevent recursive kmalloc() in alloc_empty_sheaf()") Fixes: 913ffd3a1bf5 ("slab: handle kmalloc sheaves bootstrap") Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/all/20260223155128.3849-1-00107082@163.com/ Analyzed-by: Harry Yoo Signed-off-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Tested-by: Harry Yoo Tested-by: David Wang <00107082@163.com> Link: https://patch.msgid.link/20260225163407.2218712-1-surenb@google.com Signed-off-by: Vlastimil Babka (SUSE) --- include/linux/gfp_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 814bb2892f99..6c75df30a281 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -139,6 +139,8 @@ enum { * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. * * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. + * mark_obj_codetag_empty() should be called upon freeing for objects allocated + * with this flag to indicate that their NULL tags are expected and normal. */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) -- cgit v1.2.3 From 2b351ea42820a7ecc2e8305724536512984f4419 Mon Sep 17 00:00:00 2001 From: Sanjay Chitroda Date: Thu, 26 Feb 2026 11:17:12 +0530 Subject: mm/slub: drop duplicate kernel-doc for ksize() The implementation of ksize() was updated with kernel-doc by commit fab0694646d7 ("mm/slab: move [__]ksize and slab_ksize() to mm/slub.c") However, the public header still contains a kernel-doc comment attached to the ksize() prototype. Having documentation both in the header and next to the implementation causes Sphinx to treat the function as being documented twice, resulting in the warning: WARNING: Duplicate C declaration, also defined at core-api/mm-api:521 Declaration is '.. c:function:: size_t ksize(const void *objp)' Kernel-doc guidelines recommend keeping the documentation with the function implementation. Therefore remove the redundant kernel-doc block from include/linux/slab.h so that the implementation in slub.c remains the canonical source for documentation. No functional change. Fixes: fab0694646d7 ("mm/slab: move [__]ksize and slab_ksize() to mm/slub.c") Signed-off-by: Sanjay Chitroda Link: https://patch.msgid.link/20260226054712.3610744-1-sanjayembedded@gmail.com Signed-off-by: Vlastimil Babka (SUSE) --- include/linux/slab.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index a5a5e4108ae5..15a60b501b95 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -517,18 +517,6 @@ void kfree_sensitive(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) -/** - * ksize - Report actual allocation size of associated object - * - * @objp: Pointer returned from a prior kmalloc()-family allocation. - * - * This should not be used for writing beyond the originally requested - * allocation size. Either use krealloc() or round up the allocation size - * with kmalloc_size_roundup() prior to allocation. If this is used to - * access beyond the originally requested allocation size, UBSAN_BOUNDS - * and/or FORTIFY_SOURCE may trip, since they only know about the - * originally allocated size via the __alloc_size attribute. - */ size_t ksize(const void *objp); #ifdef CONFIG_PRINTK -- cgit v1.2.3 From 1df97a7453eec80c1912c2d0360290a3970a7671 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:01 -0800 Subject: bpf: Register dtor for freeing special fields There is a race window where BPF hash map elements can leak special fields if the program with access to the map value recreates these special fields between the check_and_free_fields done on the map value and its eventual return to the memory allocator. Several ways were explored prior to this patch, most notably [0] tried to use a poison value to reject attempts to recreate special fields for map values that have been logically deleted but still accessible to BPF programs (either while sitting in the free list or when reused). While this approach works well for task work, timers, wq, etc., it is harder to apply the idea to kptrs, which have a similar race and failure mode. Instead, we change bpf_mem_alloc to allow registering destructor for allocated elements, such that when they are returned to the allocator, any special fields created while they were accessible to programs in the mean time will be freed. If these values get reused, we do not free the fields again before handing the element back. The special fields thus may remain initialized while the map value sits in a free list. When bpf_mem_alloc is retired in the future, a similar concept can be introduced to kmalloc_nolock-backed kmem_cache, paired with the existing idea of a constructor. Note that the destructor registration happens in map_check_btf, after the BTF record is populated and (at that point) avaiable for inspection and duplication. Duplication is necessary since the freeing of embedded bpf_mem_alloc can be decoupled from actual map lifetime due to logic introduced to reduce the cost of rcu_barrier()s in mem alloc free path in 9f2c6e96c65e ("bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc."). As such, once all callbacks are done, we must also free the duplicated record. To remove dependency on the bpf_map itself, also stash the key size of the map to obtain value from htab_elem long after the map is gone. [0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Reported-by: Alexei Starovoitov Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index e45162ef59bb..4ce0d27f8ea2 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,8 @@ struct bpf_mem_alloc { struct obj_cgroup *objcg; bool percpu; struct work_struct work; + void (*dtor_ctx_free)(void *ctx); + void *dtor_ctx; }; /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. @@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg /* The percpu allocation with a specific unit size. */ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, + void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), + void *ctx); /* Check the allocation size for kmalloc equivalent allocator */ int bpf_mem_alloc_check_size(bool percpu, size_t size); -- cgit v1.2.3 From ae51772b1e94ba1d76db19085957dbccac189c1c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:02 -0800 Subject: bpf: Lose const-ness of map in map_check_btf() BPF hash map may now use the map_check_btf() callback to decide whether to set a dtor on its bpf_mem_alloc or not. Unlike C++ where members can opt out of const-ness using mutable, we must lose the const qualifier on the callback such that we can avoid the ugly cast. Make the change and adjust all existing users, and lose the comment in hashtab.c. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- include/linux/bpf_local_storage.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b78b53198a2e..05b34a6355b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -124,7 +124,7 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, + int (*map_check_btf)(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); @@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) map->ops->map_seq_show_elem; } -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 85efa9772530..8157e8da61d4 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); -- cgit v1.2.3 From 76e954155b45294c502e3d3a9e15757c858ca55e Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Fri, 27 Feb 2026 22:32:21 +0100 Subject: bpf: Introduce tnum_step to step through tnum's members This commit introduces tnum_step(), a function that, when given t, and a number z returns the smallest member of t larger than z. The number z must be greater or equal to the smallest member of t and less than the largest member of t. The first step is to compute j, a number that keeps all of t's known bits, and matches all unknown bits to z's bits. Since j is a member of the t, it is already a candidate for result. However, we want our result to be (minimally) greater than z. There are only two possible cases: (1) Case j <= z. In this case, we want to increase the value of j and make it > z. (2) Case j > z. In this case, we want to decrease the value of j while keeping it > z. (Case 1) j <= z t = xx11x0x0 z = 10111101 (189) j = 10111000 (184) ^ k (Case 1.1) Let's first consider the case where j < z. We will address j == z later. Since z > j, there had to be a bit position that was 1 in z and a 0 in j, beyond which all positions of higher significance are equal in j and z. Further, this position could not have been unknown in a, because the unknown positions of a match z. This position had to be a 1 in z and known 0 in t. Let k be position of the most significant 1-to-0 flip. In our example, k = 3 (starting the count at 1 at the least significant bit). Setting (to 1) the unknown bits of t in positions of significance smaller than k will not produce a result > z. Hence, we must set/unset the unknown bits at positions of significance higher than k. Specifically, we look for the next larger combination of 1s and 0s to place in those positions, relative to the combination that exists in z. We can achieve this by concatenating bits at unknown positions of t into an integer, adding 1, and writing the bits of that result back into the corresponding bit positions previously extracted from z. >From our example, considering only positions of significance greater than k: t = xx..x z = 10..1 + 1 ----- 11..0 This is the exact combination 1s and 0s we need at the unknown bits of t in positions of significance greater than k. Further, our result must only increase the value minimally above z. Hence, unknown bits in positions of significance smaller than k should remain 0. We finally have, result = 11110000 (240) (Case 1.2) Now consider the case when j = z, for example t = 1x1x0xxx z = 10110100 (180) j = 10110100 (180) Matching the unknown bits of the t to the bits of z yielded exactly z. To produce a number greater than z, we must set/unset the unknown bits in t, and *all* the unknown bits of t candidates for being set/unset. We can do this similar to Case 1.1, by adding 1 to the bits extracted from the masked bit positions of z. Essentially, this case is equivalent to Case 1.1, with k = 0. t = 1x1x0xxx z = .0.1.100 + 1 --------- .0.1.101 This is the exact combination of bits needed in the unknown positions of t. After recalling the known positions of t, we get result = 10110101 (181) (Case 2) j > z t = x00010x1 z = 10000010 (130) j = 10001011 (139) ^ k Since j > z, there had to be a bit position which was 0 in z, and a 1 in j, beyond which all positions of higher significance are equal in j and z. This position had to be a 0 in z and known 1 in t. Let k be the position of the most significant 0-to-1 flip. In our example, k = 4. Because of the 0-to-1 flip at position k, a member of t can become greater than z if the bits in positions greater than k are themselves >= to z. To make that member *minimally* greater than z, the bits in positions greater than k must be exactly = z. Hence, we simply match all of t's unknown bits in positions more significant than k to z's bits. In positions less significant than k, we set all t's unknown bits to 0 to retain minimality. In our example, in positions of greater significance than k (=4), t=x000. These positions are matched with z (1000) to produce 1000. In positions of lower significance than k, t=10x1. All unknown bits are set to 0 to produce 1001. The final result is: result = 10001001 (137) This concludes the computation for a result > z that is a member of t. The procedure for tnum_step() in this commit implements the idea described above. As a proof of correctness, we verified the algorithm against a logical specification of tnum_step. The specification asserts the following about the inputs t, z and output res that: 1. res is a member of t, and 2. res is strictly greater than z, and 3. there does not exist another value res2 such that 3a. res2 is also a member of t, and 3b. res2 is greater than z 3c. res2 is smaller than res We checked the implementation against this logical specification using an SMT solver. The verification formula in SMTLIB format is available at [1]. The verification returned an "unsat": indicating that no input assignment exists for which the implementation and the specification produce different outputs. In addition, we also automatically generated the logical encoding of the C implementation using Agni [2] and verified it against the same specification. This verification also returned an "unsat", confirming that the implementation is equivalent to the specification. The formula for this check is also available at [3]. Link: https://pastebin.com/raw/2eRWbiit [1] Link: https://github.com/bpfverif/agni [2] Link: https://pastebin.com/raw/EztVbBJ2 [3] Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Link: https://lore.kernel.org/r/93fdf71910411c0f19e282ba6d03b4c65f9c5d73.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index fa4654ffb621..ca2cfec8de08 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -131,4 +131,7 @@ static inline bool tnum_subreg_is_const(struct tnum a) return !(tnum_subreg(a)).mask; } +/* Returns the smallest member of t larger than z */ +u64 tnum_step(struct tnum t, u64 z); + #endif /* _LINUX_TNUM_H */ -- cgit v1.2.3 From 407fd8b8d8cce03856aa67329715de48b254b529 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 11 Feb 2026 19:03:03 +0100 Subject: KVM: remove CONFIG_KVM_GENERIC_MMU_NOTIFIER All architectures now use MMU notifier for KVM page table management. Remove the Kconfig symbol and the code that is used when it is disabled. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dde605cb894e..34759a262b28 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -253,7 +253,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { unsigned long attributes; }; @@ -275,7 +274,6 @@ struct kvm_gfn_range { bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -#endif enum { OUTSIDE_GUEST_MODE, @@ -849,13 +847,12 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; gfn_t mmu_invalidate_range_start; gfn_t mmu_invalidate_range_end; -#endif + struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; @@ -2118,7 +2115,6 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) @@ -2196,7 +2192,6 @@ static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; } -#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING -- cgit v1.2.3