From af22bbe1f4a514c80b89a27252beef033168f4e9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:48 +0200 Subject: virtio: create admin queues alongside other virtqueues Admin virtqueue is just another virtqueue nothing that special about it. The current implementation treats it somehow separate though in terms of creation and deletion. Unify the admin virtqueue creation and deletion flows to be aligned with the rest of virtqueues, creating it from vp_find_vqs_*() helpers. Let the admin virtqueue to be deleted by vp_del_vqs() as the rest. Call vp_find_one_vq_msix() with slow_path argument being "true" to make sure that in case of limited interrupt vectors the config vector is used for admin queue. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-10-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index ab4b9a3fef6b..169c7d367fac 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -104,8 +104,6 @@ struct virtqueue_info { * Returns 0 on success or error status * If disable_vq_and_reset is set, then enable_vq_after_reset must also be * set. - * @create_avq: create admin virtqueue resource. - * @destroy_avq: destroy admin virtqueue resource. */ struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, @@ -133,8 +131,6 @@ struct virtio_config_ops { struct virtio_shm_region *region, u8 id); int (*disable_vq_and_reset)(struct virtqueue *vq); int (*enable_vq_after_reset)(struct virtqueue *vq); - int (*create_avq)(struct virtio_device *vdev); - void (*destroy_avq)(struct virtio_device *vdev); }; /* If driver didn't advertise the feature, it will never appear. */ -- cgit v1.2.3 From 4c3b54af907e709609d3d8beca92d65e2f0cfd83 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:51 +0200 Subject: virtio_pci_modern: use completion instead of busy loop to wait on admin cmd result Currently, the code waits in a busy loop on every admin virtqueue issued command to get a reply. That prevents callers from issuing multiple commands in parallel. To overcome this limitation, introduce a virtqueue event callback for admin virtqueue. For every issued command, use completion mechanism to wait on a reply. In the event callback, trigger the completion is done for every incoming reply. Alongside with that, introduce a spin lock to protect the admin virtqueue operations. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-13-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 96fea920873b..999ff5934392 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * struct virtqueue - a queue to register buffers for sending or receiving. @@ -109,6 +110,8 @@ struct virtio_admin_cmd { __le64 group_member_id; struct scatterlist *data_sg; struct scatterlist *result_sg; + struct completion completion; + int ret; }; /** -- cgit v1.2.3 From e1a261ba599eec97e1c5c7760d5c3698fc24e6a6 Mon Sep 17 00:00:00 2001 From: Jocelyn Falempe Date: Tue, 2 Jul 2024 14:26:04 +0200 Subject: printk: Add a short description string to kmsg_dump() kmsg_dump doesn't forward the panic reason string to the kmsg_dumper callback. This patch adds a new struct kmsg_dump_detail, that will hold the reason and description, and pass it to the dump() callback. To avoid updating all kmsg_dump() call, it adds a kmsg_dump_desc() function and a macro for backward compatibility. I've written this for drm_panic, but it can be useful for other kmsg_dumper. It allows to see the panic reason, like "sysrq triggered crash" or "VFS: Unable to mount root fs on xxxx" on the drm panic screen. v2: * Use a struct kmsg_dump_detail to hold the reason and description pointer, for more flexibility if we want to add other parameters. (Kees Cook) * Fix powerpc/nvram_64 build, as I didn't update the forward declaration of oops_to_nvram() Signed-off-by: Jocelyn Falempe Acked-by: Petr Mladek Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook Link: https://patchwork.freedesktop.org/patch/msgid/20240702122639.248110-1-jfalempe@redhat.com --- include/linux/kmsg_dump.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 906521c2329c..6055fc969877 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -39,6 +39,17 @@ struct kmsg_dump_iter { u64 next_seq; }; +/** + * struct kmsg_dump_detail - kernel crash detail + * @reason: reason for the crash, see kmsg_dump_reason. + * @description: optional short string, to provide additional information. + */ + +struct kmsg_dump_detail { + enum kmsg_dump_reason reason; + const char *description; +}; + /** * struct kmsg_dumper - kernel crash message dumper structure * @list: Entry in the dumper list (private) @@ -49,13 +60,13 @@ struct kmsg_dump_iter { */ struct kmsg_dumper { struct list_head list; - void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); + void (*dump)(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail); enum kmsg_dump_reason max_reason; bool registered; }; #ifdef CONFIG_PRINTK -void kmsg_dump(enum kmsg_dump_reason reason); +void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc); bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len); @@ -71,7 +82,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason); #else -static inline void kmsg_dump(enum kmsg_dump_reason reason) +static inline void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { } @@ -107,4 +118,9 @@ static inline const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) } #endif +static inline void kmsg_dump(enum kmsg_dump_reason reason) +{ + kmsg_dump_desc(reason, NULL); +} + #endif /* _LINUX_KMSG_DUMP_H */ -- cgit v1.2.3 From d20a9f568f99591886ec73b80ff7283486710643 Mon Sep 17 00:00:00 2001 From: Jocelyn Falempe Date: Wed, 17 Jul 2024 10:48:40 +0200 Subject: fbcon: Add an option to disable fbcon in panic This is required to avoid conflict between DRM_PANIC, and fbcon. If a drm device already handle panic with drm_panic, it should set the skip_panic field in fb_info, so that fbcon will stay quiet, and not overwrite the panic_screen. Signed-off-by: Jocelyn Falempe Reviewed-by: Javier Martinez Canillas Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20240717090102.968152-3-jfalempe@redhat.com --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index db7d97b10964..865dad03e73e 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -510,6 +510,7 @@ struct fb_info { void *par; bool skip_vt_switch; /* no VT switch on suspend/resume required */ + bool skip_panic; /* Do not write to the fb after a panic */ }; /* This will go away -- cgit v1.2.3 From d5e79eeba3086a52593b295ac4bf6eddd64d4aad Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Sat, 20 Jul 2024 15:15:42 +0800 Subject: dma-buf: heaps: Deduplicate docs and adopt common format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docs for dma_heap_get_name were incorrect, and since they were duplicated in the header they were wrong there too. The docs formatting was inconsistent so I tried to make it more consistent across functions since I'm already in here doing cleanup. Remove multiple unused includes and alphabetize. Signed-off-by: T.J. Mercier Signed-off-by: Yong Wu [Yong: Just add a comment for "priv" to mute build warning] Signed-off-by: Yunfei Dong Link: https://patchwork.freedesktop.org/patch/msgid/20240720071606.27930-5-yunfei.dong@mediatek.com Signed-off-by: Christian König --- include/linux/dma-heap.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h index 064bad725061..27d15f60950a 100644 --- a/include/linux/dma-heap.h +++ b/include/linux/dma-heap.h @@ -9,14 +9,13 @@ #ifndef _DMA_HEAPS_H #define _DMA_HEAPS_H -#include #include struct dma_heap; /** * struct dma_heap_ops - ops to operate on a given heap - * @allocate: allocate dmabuf and return struct dma_buf ptr + * @allocate: allocate dmabuf and return struct dma_buf ptr * * allocate returns dmabuf on success, ERR_PTR(-errno) on error. */ @@ -41,28 +40,10 @@ struct dma_heap_export_info { void *priv; }; -/** - * dma_heap_get_drvdata() - get per-heap driver data - * @heap: DMA-Heap to retrieve private data for - * - * Returns: - * The per-heap data for the heap. - */ void *dma_heap_get_drvdata(struct dma_heap *heap); -/** - * dma_heap_get_name() - get heap name - * @heap: DMA-Heap to retrieve private data for - * - * Returns: - * The char* for the heap name. - */ const char *dma_heap_get_name(struct dma_heap *heap); -/** - * dma_heap_add - adds a heap to dmabuf heaps - * @exp_info: information needed to register this heap - */ struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info); #endif /* _DMA_HEAPS_H */ -- cgit v1.2.3 From 564429a6bd8d26065b2cccffcaa9485359f74de7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:47 -0400 Subject: KVM: rename CONFIG_HAVE_KVM_GMEM_* to CONFIG_HAVE_KVM_ARCH_GMEM_* Add "ARCH" to the symbols; shortly, the "prepare" phase will include both the arch-independent step to clear out contents left in the page by the host, and the arch-dependent step enabled by CONFIG_HAVE_KVM_GMEM_PREPARE. For consistency do the same for CONFIG_HAVE_KVM_GMEM_INVALIDATE as well. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 689e8be873a7..344d90771844 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2445,7 +2445,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); #endif @@ -2477,7 +2477,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif -- cgit v1.2.3 From 7239ed74677af143857d1a96d402476446a0995a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:51 -0400 Subject: KVM: remove kvm_arch_gmem_prepare_needed() It is enough to return 0 if a guest need not do any preparation. This is in fact how sev_gmem_prepare() works for non-SNP guests, and it extends naturally to Intel hosts: the x86 callback for gmem_prepare is optional and returns 0 if not defined. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 344d90771844..45373d42f314 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2447,7 +2447,6 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); -bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); #endif /** -- cgit v1.2.3 From 4b5f67120a88c713b82907d55a767693382e9e9d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:54 -0400 Subject: KVM: extend kvm_range_has_memory_attributes() to check subset of attributes While currently there is no other attribute than KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM code such as kvm_mem_is_private() is written to expect their existence. Allow using kvm_range_has_memory_attributes() as a multi-page version of kvm_mem_is_private(), without it breaking later when more attributes are introduced. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 45373d42f314..c223b97df03e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2414,7 +2414,7 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn } bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, - unsigned long attrs); + unsigned long mask, unsigned long attrs); bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, -- cgit v1.2.3 From e4ee5447927377c55777b73fe497a2455a25f948 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 18:27:55 -0400 Subject: KVM: guest_memfd: let kvm_gmem_populate() operate only on private gfns This check is currently performed by sev_gmem_post_populate(), but it applies to all callers of kvm_gmem_populate(): the point of the function is that the memory is being encrypted and some work has to be done on all the gfns in order to encrypt them. Therefore, check the KVM_MEMORY_ATTRIBUTE_PRIVATE attribute prior to invoking the callback, and stop the operation if a shared page is encountered. Because CONFIG_KVM_PRIVATE_MEM in principle does not require attributes, this makes kvm_gmem_populate() depend on CONFIG_KVM_GENERIC_PRIVATE_MEM (which does require them). Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c223b97df03e..79a6b1a63027 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2449,6 +2449,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif +#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * @@ -2475,6 +2476,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); +#endif #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); -- cgit v1.2.3 From 1a251f52cfdc417c84411a056bc142cbd77baef4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 15:49:18 -0700 Subject: minmax: make generic MIN() and MAX() macros available everywhere This just standardizes the use of MIN() and MAX() macros, with the very traditional semantics. The goal is to use these for C constant expressions and for top-level / static initializers, and so be able to simplify the min()/max() macros. These macro names were used by various kernel code - they are very traditional, after all - and all such users have been fixed up, with a few different approaches: - trivial duplicated macro definitions have been removed Note that 'trivial' here means that it's obviously kernel code that already included all the major kernel headers, and thus gets the new generic MIN/MAX macros automatically. - non-trivial duplicated macro definitions are guarded with #ifndef This is the "yes, they define their own versions, but no, the include situation is not entirely obvious, and maybe they don't get the generic version automatically" case. - strange use case #1 A couple of drivers decided that the way they want to describe their versioning is with #define MAJ 1 #define MIN 2 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) which adds zero value and I just did my Alexander the Great impersonation, and rewrote that pointless Gordian knot as #define DRV_VERSION "1.2" instead. - strange use case #2 A couple of drivers thought that it's a good idea to have a random 'MIN' or 'MAX' define for a value or index into a table, rather than the traditional macro that takes arguments. These values were re-written as C enum's instead. The new function-line macros only expand when followed by an open parenthesis, and thus don't clash with enum use. Happily, there weren't really all that many of these cases, and a lot of users already had the pattern of using '#ifndef' guarding (or in one case just using '#undef MIN') before defining their own private version that does the same thing. I left such cases alone. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 9c2848abc804..fc384714da45 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -277,6 +277,8 @@ static inline bool in_range32(u32 val, u32 start, u32 len) * Use these carefully: no type checking, and uses the arguments * multiple times. Use for obvious constants only. */ +#define MIN(a,b) __cmp(min,a,b) +#define MAX(a,b) __cmp(max,a,b) #define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) #define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) -- cgit v1.2.3 From dc1c8034e31b14a2e5e212104ec508aec44ce1b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 20:24:12 -0700 Subject: minmax: simplify min()/max()/clamp() implementation Now that we no longer have any C constant expression contexts (ie array size declarations or static initializers) that use min() or max(), we can simpify the implementation by not having to worry about the result staying as a C constant expression. So now we can unconditionally just use temporary variables of the right type, and get rid of the excessive expansion that used to come from the use of __builtin_choose_expr(__is_constexpr(...), .. to pick the specialized code for constant expressions. Another expansion simplification is to pass the temporary variables (in addition to the original expression) to our __types_ok() macro. That may superficially look like it complicates the macro, but when we only want the type of the expression, expanding the temporary variable names is much simpler and smaller than expanding the potentially complicated original expression. As a result, on my machine, doing a $ time make drivers/staging/media/atomisp/pci/isp/kernels/ynr/ynr_1.0/ia_css_ynr.host.i goes from real 0m16.621s user 0m15.360s sys 0m1.221s to real 0m2.532s user 0m2.091s sys 0m0.452s because the token expansion goes down dramatically. In particular, the longest line expansion (which was line 71 of that 'ia_css_ynr.host.c' file) shrinks from 23,338kB (yes, 23MB for one single line) to "just" 1,444kB (now "only" 1.4MB). And yes, that line is still the line from hell, because it's doing multiple levels of "min()/max()" expansion thanks to some of them being hidden inside the uDIGIT_FITTING() macro. Lorenzo has a nice cleanup patch that makes that driver use inline functions instead of macros for sDIGIT_FITTING() and uDIGIT_FITTING(), which will fix that line once and for all, but the 16-fold reduction in this case does show why we need to simplify these helpers. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index fc384714da45..e3e4353df983 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -35,10 +35,10 @@ #define __is_noneg_int(x) \ (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0) -#define __types_ok(x, y) \ - (__is_signed(x) == __is_signed(y) || \ - __is_signed((x) + 0) == __is_signed((y) + 0) || \ - __is_noneg_int(x) || __is_noneg_int(y)) +#define __types_ok(x, y, ux, uy) \ + (__is_signed(ux) == __is_signed(uy) || \ + __is_signed((ux) + 0) == __is_signed((uy) + 0) || \ + __is_noneg_int(x) || __is_noneg_int(y)) #define __cmp_op_min < #define __cmp_op_max > @@ -51,34 +51,31 @@ #define __cmp_once(op, type, x, y) \ __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) -#define __careful_cmp_once(op, x, y) ({ \ - static_assert(__types_ok(x, y), \ +#define __careful_cmp_once(op, x, y, ux, uy) ({ \ + __auto_type ux = (x); __auto_type uy = (y); \ + static_assert(__types_ok(x, y, ux, uy), \ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp_once(op, __auto_type, x, y); }) + __cmp(op, ux, uy); }) -#define __careful_cmp(op, x, y) \ - __builtin_choose_expr(__is_constexpr((x) - (y)), \ - __cmp(op, x, y), __careful_cmp_once(op, x, y)) +#define __careful_cmp(op, x, y) \ + __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) -#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ - typeof(val) unique_val = (val); \ - typeof(lo) unique_lo = (lo); \ - typeof(hi) unique_hi = (hi); \ +#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ + __auto_type uval = (val); \ + __auto_type ulo = (lo); \ + __auto_type uhi = (hi); \ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error"); \ - static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error"); \ - __clamp(unique_val, unique_lo, unique_hi); }) - -#define __careful_clamp(val, lo, hi) ({ \ - __builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)), \ - __clamp(val, lo, hi), \ - __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \ - __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); }) + static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error"); \ + static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error"); \ + __clamp(uval, ulo, uhi); }) + +#define __careful_clamp(val, lo, hi) \ + __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) /** * min - return minimum of two values of the same or compatible types -- cgit v1.2.3 From 2accfdb7eff65f390c4308b0e9cb7c3fe48ad63c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 10:58:28 -0700 Subject: profiling: attempt to remove per-cpu profile flip buffer This is the really old legacy kernel profiling code, which has long since been obviated by "real profiling" (ie 'prof' and company), and mainly remains as a source of syzbot reports. There are anecdotal reports that people still use it for boot-time profiling, but it's unlikely that such use would care about the old NUMA optimizations in this code from 2004 (commit ad02973d42: "profile: 512x Altix timer interrupt livelock fix" in the BK import archive at [1]) So in order to head off future syzbot reports, let's try to simplify this code and get rid of the per-cpu profile buffers that are quite a large portion of the complexity footprint of this thing (including CPU hotplug callbacks etc). It's unlikely anybody will actually notice, or possibly, as Thomas put it: "Only people who indulge in nostalgia will notice :)". That said, if it turns out that this code is actually actively used by somebody, we can always revert this removal. Thus the "attempt" in the summary line. [ Note: in a small nod to "the profiling code can cause NUMA problems", this also removes the "increment the last entry in the profiling array on any unknown hits" logic. That would account any program counter in a module to that single counter location, and might exacerbate any NUMA cacheline bouncing issues ] Link: https://lore.kernel.org/all/CAHk-=wgs52BxT4Zjmjz8aNvHWKxf5_ThBY4bYL1Y6CTaNL2dTw@mail.gmail.com/ Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1] Cc: Thomas Gleixner Cc: Tetsuo Handa Signed-off-by: Linus Torvalds --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 51ba681b915a..affdd890899e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -100,7 +100,6 @@ enum cpuhp_state { CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, - CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, -- cgit v1.2.3 From 22f5468731491e53356ba7c028f0fdea20b18e2c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 30 Jul 2024 10:36:47 -0700 Subject: minmax: improve macro expansion and type checking This clarifies the rules for min()/max()/clamp() type checking and makes them a much more efficient macro expansion. In particular, we now look at the type and range of the inputs to see whether they work together, generating a mask of acceptable comparisons, and then just verifying that the inputs have a shared case: - an expression with a signed type can be used for (1) signed comparisons (2) unsigned comparisons if it is statically known to have a non-negative value - an expression with an unsigned type can be used for (3) unsigned comparison (4) signed comparisons if the type is smaller than 'int' and thus the C integer promotion rules will make it signed anyway Here rule (1) and (3) are obvious, and rule (2) is important in order to allow obvious trivial constants to be used together with unsigned values. Rule (4) is not necessarily a good idea, but matches what we used to do, and we have extant cases of this situation in the kernel. Notably with bcachefs having an expression like min(bch2_bucket_sectors_dirty(a), ca->mi.bucket_size) where bch2_bucket_sectors_dirty() returns an 's64', and 'ca->mi.bucket_size' is of type 'u16'. Technically that bcachefs comparison is clearly sensible on a C type level, because the 'u16' will go through the normal C integer promotion, and become 'int', and then we're comparing two signed values and everything looks sane. However, it's not entirely clear that a 'min(s64,u16)' operation makes a lot of conceptual sense, and it's possible that we will remove rule (4). After all, the _reason_ we have these complicated type checks is exactly that the C type promotion rules are not very intuitive. But at least for now the rule is in place for backwards compatibility. Also note that rule (2) existed before, but is hugely relaxed by this commit. It used to be true only for the simplest compile-time non-negative integer constants. The new macro model will allow cases where the compiler can trivially see that an expression is non-negative even if it isn't necessarily a constant. For example, the amdgpu driver does min_t(size_t, sizeof(fru_info->serial), pia[addr] & 0x3F)); because our old 'min()' macro would see that 'pia[addr] & 0x3F' is of type 'int' and clearly not a C constant expression, so doing a 'min()' with a 'size_t' is a signedness violation. Our new 'min()' macro still sees that 'pia[addr] & 0x3F' is of type 'int', but is smart enough to also see that it is clearly non-negative, and thus would allow that case without any complaints. Cc: Arnd Bergmann Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 9 ++++++ include/linux/minmax.h | 74 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2594553bb30b..2df665fa2964 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -296,6 +296,15 @@ static inline void *offset_to_ptr(const int *off) #define is_signed_type(type) (((type)(-1)) < (__force type)1) #define is_unsigned_type(type) (!is_signed_type(type)) +/* + * Useful shorthand for "is this condition known at compile-time?" + * + * Note that the condition may involve non-constant values, + * but the compiler may know enough about the details of the + * values to determine that the condition is statically true. + */ +#define statically_true(x) (__builtin_constant_p(x) && (x)) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/minmax.h b/include/linux/minmax.h index e3e4353df983..41da6f85a407 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -26,19 +26,63 @@ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) -/* is_signed_type() isn't a constexpr for pointer types */ -#define __is_signed(x) \ - __builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))), \ - is_signed_type(typeof(x)), 0) +/* + * __sign_use for integer expressions: + * bit #0 set if ok for unsigned comparisons + * bit #1 set if ok for signed comparisons + * + * In particular, statically non-negative signed integer + * expressions are ok for both. + * + * NOTE! Unsigned types smaller than 'int' are implicitly + * converted to 'int' in expressions, and are accepted for + * signed conversions for now. This is debatable. + * + * Note that 'x' is the original expression, and 'ux' is + * the unique variable that contains the value. + * + * We use 'ux' for pure type checking, and 'x' for when + * we need to look at the value (but without evaluating + * it for side effects! Careful to only ever evaluate it + * with sizeof() or __builtin_constant_p() etc). + * + * Pointers end up being checked by the normal C type + * rules at the actual comparison, and these expressions + * only need to be careful to not cause warnings for + * pointer use. + */ +#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux)) +#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4)) +#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \ + __signed_type_use(x,ux):__unsigned_type_use(x,ux)) + +/* + * To avoid warnings about casting pointers to integers + * of different sizes, we need that special sign type. + * + * On 64-bit we can just always use 'long', since any + * integer or pointer type can just be cast to that. + * + * This does not work for 128-bit signed integers since + * the cast would truncate them, but we do not use s128 + * types in the kernel (we do use 'u128', but they will + * be handled by the !is_signed_type() case). + * + * NOTE! The cast is there only to avoid any warnings + * from when values that aren't signed integer types. + */ +#ifdef CONFIG_64BIT + #define __signed_type(ux) long +#else + #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L)) +#endif +#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0) -/* True for a non-negative signed int constant */ -#define __is_noneg_int(x) \ - (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0) +#define __types_ok(x,y,ux,uy) \ + (__sign_use(x,ux) & __sign_use(y,uy)) -#define __types_ok(x, y, ux, uy) \ - (__is_signed(ux) == __is_signed(uy) || \ - __is_signed((ux) + 0) == __is_signed((uy) + 0) || \ - __is_noneg_int(x) || __is_noneg_int(y)) +#define __types_ok3(x,y,z,ux,uy,uz) \ + (__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -53,8 +97,8 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - static_assert(__types_ok(x, y, ux, uy), \ - #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ + BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ + #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) #define __careful_cmp(op, x, y) \ @@ -70,8 +114,8 @@ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error"); \ - static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error"); \ + BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi), \ + "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) #define __careful_clamp(val, lo, hi) \ -- cgit v1.2.3 From 89add40066f9ed9abe5f7f886fe5789ff7e0c50e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 29 Jul 2024 16:10:12 -0400 Subject: net: drop bad gso csum_start and offset in virtio_net_hdr Tighten csum_start and csum_offset checks in virtio_net_hdr_to_skb for GSO packets. The function already checks that a checksum requested with VIRTIO_NET_HDR_F_NEEDS_CSUM is in skb linear. But for GSO packets this might not hold for segs after segmentation. Syzkaller demonstrated to reach this warning in skb_checksum_help offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (WARN_ON_ONCE(offset >= skb_headlen(skb))) By injecting a TSO packet: WARNING: CPU: 1 PID: 3539 at net/core/dev.c:3284 skb_checksum_help+0x3d0/0x5b0 ip_do_fragment+0x209/0x1b20 net/ipv4/ip_output.c:774 ip_finish_output_gso net/ipv4/ip_output.c:279 [inline] __ip_finish_output+0x2bd/0x4b0 net/ipv4/ip_output.c:301 iptunnel_xmit+0x50c/0x930 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x2296/0x2c70 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x759/0xa60 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4850 [inline] netdev_start_xmit include/linux/netdevice.h:4864 [inline] xmit_one net/core/dev.c:3595 [inline] dev_hard_start_xmit+0x261/0x8c0 net/core/dev.c:3611 __dev_queue_xmit+0x1b97/0x3c90 net/core/dev.c:4261 packet_snd net/packet/af_packet.c:3073 [inline] The geometry of the bad input packet at tcp_gso_segment: [ 52.003050][ T8403] skb len=12202 headroom=244 headlen=12093 tailroom=0 [ 52.003050][ T8403] mac=(168,24) mac_len=24 net=(192,52) trans=244 [ 52.003050][ T8403] shinfo(txflags=0 nr_frags=1 gso(size=1552 type=3 segs=0)) [ 52.003050][ T8403] csum(0x60000c7 start=199 offset=1536 ip_summed=3 complete_sw=0 valid=0 level=0) Mitigate with stricter input validation. csum_offset: for GSO packets, deduce the correct value from gso_type. This is already done for USO. Extend it to TSO. Let UFO be: udp[46]_ufo_fragment ignores these fields and always computes the checksum in software. csum_start: finding the real offset requires parsing to the transport header. Do not add a parser, use existing segmentation parsing. Thanks to SKB_GSO_DODGY, that also catches bad packets that are hw offloaded. Again test both TSO and USO. Do not test UFO for the above reason, and do not test UDP tunnel offload. GSO packet are almost always CHECKSUM_PARTIAL. USO packets may be CHECKSUM_NONE since commit 10154dbded6d6 ("udp: Allow GSO transmit from devices with no checksum offload"), but then still these fields are initialized correctly in udp4_hwcsum/udp6_hwcsum_outgoing. So no need to test for ip_summed == CHECKSUM_PARTIAL first. This revises an existing fix mentioned in the Fixes tag, which broke small packets with GSO offload, as detected by kselftests. Link: https://syzkaller.appspot.com/bug?extid=e1db31216c789f552871 Link: https://lore.kernel.org/netdev/20240723223109.2196886-1-kuba@kernel.org Fixes: e269d79c7d35 ("net: missing check virtio") Cc: stable@vger.kernel.org Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20240729201108.1615114-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index d1d7825318c3..6c395a2600e8 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -56,7 +56,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, unsigned int thlen = 0; unsigned int p_off = 0; unsigned int ip_proto; - u64 ret, remainder, gso_size; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -99,16 +98,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); - if (hdr->gso_size) { - gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); - ret = div64_u64_rem(skb->len, gso_size, &remainder); - if (!(ret && (hdr->gso_size > needed) && - ((remainder > needed) || (remainder == 0)))) { - return -EINVAL; - } - skb_shinfo(skb)->tx_flags |= SKBFL_SHARED_FRAG; - } - if (!pskb_may_pull(skb, needed)) return -EINVAL; @@ -182,6 +171,11 @@ retry: if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; + case SKB_GSO_TCPV4: + case SKB_GSO_TCPV6: + if (skb->csum_offset != offsetof(struct tcphdr, check)) + return -EINVAL; + break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ -- cgit v1.2.3 From 21b136cc63d2a9ddd60d4699552b69c214b32964 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 30 Jul 2024 15:44:16 -0700 Subject: minmax: fix up min3() and max3() too David Laight pointed out that we should deal with the min3() and max3() mess too, which still does excessive expansion. And our current macros are actually rather broken. In particular, the macros did this: #define min3(x, y, z) min((typeof(x))min(x, y), z) #define max3(x, y, z) max((typeof(x))max(x, y), z) and that not only is a nested expansion of possibly very complex arguments with all that involves, the typing with that "typeof()" cast is completely wrong. For example, imagine what happens in max3() if 'x' happens to be a 'unsigned char', but 'y' and 'z' are 'unsigned long'. The types are compatible, and there's no warning - but the result is just random garbage. No, I don't think we've ever hit that issue in practice, but since we now have sane infrastructure for doing this right, let's just use it. It fixes any excessive expansion, and also avoids these kinds of broken type issues. Requested-by: David Laight Acked-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 41da6f85a407..98008dd92153 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -152,13 +152,20 @@ #define umax(x, y) \ __careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull) +#define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ + __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ + BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz), \ + #op"3("#x", "#y", "#z") signedness error"); \ + __cmp(op, ux, __cmp(op, uy, uz)); }) + /** * min3 - return minimum of three values * @x: first value * @y: second value * @z: third value */ -#define min3(x, y, z) min((typeof(x))min(x, y), z) +#define min3(x, y, z) \ + __careful_op3(min, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) /** * max3 - return maximum of three values @@ -166,7 +173,8 @@ * @y: second value * @z: third value */ -#define max3(x, y, z) max((typeof(x))max(x, y), z) +#define max3(x, y, z) \ + __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero -- cgit v1.2.3 From 3908ba2e0b2476e2ec13e15967bf6a37e449f2af Mon Sep 17 00:00:00 2001 From: Nick Hu Date: Wed, 17 Jul 2024 11:17:14 +0800 Subject: RISC-V: Enable the IPI before workqueue_online_cpu() Sometimes the hotplug cpu stalls at the arch_cpu_idle() for a while after workqueue_online_cpu(). When cpu stalls at the idle loop, the reschedule IPI is pending. However the enable bit is not enabled yet so the cpu stalls at WFI until watchdog timeout. Therefore enable the IPI before the workqueue_online_cpu() to fix the issue. Fixes: 63c5484e7495 ("workqueue: Add multiple affinity scopes and interface to select them") Signed-off-by: Nick Hu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20240717031714.1946036-1-nick.hu@sifive.com Signed-off-by: Palmer Dabbelt --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 51ba681b915a..e30d93b807d5 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -148,6 +148,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_LOONGARCH_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_IRQ_RISCV_IMSIC_STARTING, + CPUHP_AP_IRQ_RISCV_SBI_IPI_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, -- cgit v1.2.3 From 90ec3a8a7fd0d43026fcca979713e077d4883b56 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 2 Aug 2024 16:22:13 +0100 Subject: spi: Add empty versions of ACPI functions Provide empty versions of acpi_spi_count_resources(), acpi_spi_device_alloc() and acpi_spi_find_controller_by_adev() if the real functions are not being built. This commit fixes two problems with the original definitions: 1) There wasn't an empty version of these functions 2) The #if only depended on CONFIG_ACPI. But the functions are implemented in the core spi.c so CONFIG_SPI_MASTER must also be enabled for the real functions to exist. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20240802152215.20831-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e4f3f3d30a03..d47d5f14ff99 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -902,12 +902,29 @@ extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); extern void spi_unregister_controller(struct spi_controller *ctlr); -#if IS_ENABLED(CONFIG_ACPI) +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SPI_MASTER) extern struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev); extern struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index); int acpi_spi_count_resources(struct acpi_device *adev); +#else +static inline struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev) +{ + return NULL; +} + +static inline struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, + struct acpi_device *adev, + int index) +{ + return ERR_PTR(-ENODEV); +} + +static inline int acpi_spi_count_resources(struct acpi_device *adev) +{ + return 0; +} #endif /* -- cgit v1.2.3 From f17c06c6608ad4ecd2ccf321753fb511812d821b Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 2 Aug 2024 16:22:14 +0100 Subject: i2c: Fix conditional for substituting empty ACPI functions Add IS_ENABLED(CONFIG_I2C) to the conditional around a bunch of ACPI functions. The conditional around these functions depended only on CONFIG_ACPI. But the functions are implemented in I2C core, so are only present if CONFIG_I2C is enabled. Signed-off-by: Richard Fitzgerald Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 07e33bbc9256..7eedd0c662da 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1066,7 +1066,7 @@ static inline int of_i2c_get_board_info(struct device *dev, struct acpi_resource; struct acpi_resource_i2c_serialbus; -#if IS_ENABLED(CONFIG_ACPI) +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); -- cgit v1.2.3 From b88f55389ad27f05ed84af9e1026aa64dbfabc9a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 4 Aug 2024 18:48:10 +0900 Subject: profiling: remove profile=sleep support The kernel sleep profile is no longer working due to a recursive locking bug introduced by commit 42a20f86dc19 ("sched: Add wrapper for get_wchan() to keep task blocked") Booting with the 'profile=sleep' kernel command line option added or executing # echo -n sleep > /sys/kernel/profiling after boot causes the system to lock up. Lockdep reports kthreadd/3 is trying to acquire lock: ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: get_wchan+0x32/0x70 but task is already holding lock: ffff93ac82e08d58 (&p->pi_lock){....}-{2:2}, at: try_to_wake_up+0x53/0x370 with the call trace being lock_acquire+0xc8/0x2f0 get_wchan+0x32/0x70 __update_stats_enqueue_sleeper+0x151/0x430 enqueue_entity+0x4b0/0x520 enqueue_task_fair+0x92/0x6b0 ttwu_do_activate+0x73/0x140 try_to_wake_up+0x213/0x370 swake_up_locked+0x20/0x50 complete+0x2f/0x40 kthread+0xfb/0x180 However, since nobody noticed this regression for more than two years, let's remove 'profile=sleep' support based on the assumption that nobody needs this functionality. Fixes: 42a20f86dc19 ("sched: Add wrapper for get_wchan() to keep task blocked") Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- include/linux/profile.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/profile.h b/include/linux/profile.h index 2fb487f61d12..3f53cdb0c27c 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -10,7 +10,6 @@ #define CPU_PROFILING 1 #define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 #define KVM_PROFILING 4 struct proc_dir_entry; -- cgit v1.2.3 From 0e8b53979ac86eddb3fd76264025a70071a25574 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 5 Aug 2024 14:01:21 +0900 Subject: bpf: kprobe: remove unused declaring of bpf_kprobe_override After the commit 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one"), "bpf_kprobe_override" is not used anywhere anymore, and we can remove it now. Link: https://lore.kernel.org/all/20240710085939.11520-1-dongml2@chinatelecom.cn/ Fixes: 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one") Signed-off-by: Menglong Dong Acked-by: Jiri Olsa Signed-off-by: Masami Hiramatsu (Google) --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..9435185c10ef 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -880,7 +880,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); -- cgit v1.2.3 From f91f7ac900e7342e0fd66093dfbf7cb8cb585a99 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 17 Jul 2024 15:00:23 +0200 Subject: refcount: Report UAF for refcount_sub_and_test(0) when counter==0 When a reference counter is at zero and refcount_sub_and_test() is invoked to subtract zero, the function accepts this request without any warning and returns true. This behavior does not seem ideal because the counter being already at zero indicates a use-after-free. Furthermore, returning true by refcount_sub_and_test() in this case potentially results in a double-free done by its caller. Modify the underlying function __refcount_sub_and_test() to warn about this case as a use-after-free and have it return false to avoid the potential double-free. Signed-off-by: Petr Pavlu Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240717130023.5675-1-petr.pavlu@suse.com Signed-off-by: Kees Cook --- include/linux/refcount.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 59b3b752394d..35f039ecb272 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -266,12 +266,12 @@ bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) if (oldp) *oldp = old; - if (old == i) { + if (old > 0 && old == i) { smp_acquire__after_ctrl_dep(); return true; } - if (unlikely(old < 0 || old - i < 0)) + if (unlikely(old <= 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; -- cgit v1.2.3 From 9a2fa1472083580b6c66bdaf291f591e1170123a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Aug 2024 18:02:00 -0400 Subject: fix bitmap corruption on close_range() with CLOSE_RANGE_UNSHARE copy_fd_bitmaps(new, old, count) is expected to copy the first count/BITS_PER_LONG bits from old->full_fds_bits[] and fill the rest with zeroes. What it does is copying enough words (BITS_TO_LONGS(count/BITS_PER_LONG)), then memsets the rest. That works fine, *if* all bits past the cutoff point are clear. Otherwise we are risking garbage from the last word we'd copied. For most of the callers that is true - expand_fdtable() has count equal to old->max_fds, so there's no open descriptors past count, let alone fully occupied words in ->open_fds[], which is what bits in ->full_fds_bits[] correspond to. The other caller (dup_fd()) passes sane_fdtable_size(old_fdt, max_fds), which is the smallest multiple of BITS_PER_LONG that covers all opened descriptors below max_fds. In the common case (copying on fork()) max_fds is ~0U, so all opened descriptors will be below it and we are fine, by the same reasons why the call in expand_fdtable() is safe. Unfortunately, there is a case where max_fds is less than that and where we might, indeed, end up with junk in ->full_fds_bits[] - close_range(from, to, CLOSE_RANGE_UNSHARE) with * descriptor table being currently shared * 'to' being above the current capacity of descriptor table * 'from' being just under some chunk of opened descriptors. In that case we end up with observably wrong behaviour - e.g. spawn a child with CLONE_FILES, get all descriptors in range 0..127 open, then close_range(64, ~0U, CLOSE_RANGE_UNSHARE) and watch dup(0) ending up with descriptor #128, despite #64 being observably not open. The minimally invasive fix would be to deal with that in dup_fd(). If this proves to add measurable overhead, we can go that way, but let's try to fix copy_fd_bitmaps() first. * new helper: bitmap_copy_and_expand(to, from, bits_to_copy, size). * make copy_fd_bitmaps() take the bitmap size in words, rather than bits; it's 'count' argument is always a multiple of BITS_PER_LONG, so we are not losing any information, and that way we can use the same helper for all three bitmaps - compiler will see that count is a multiple of BITS_PER_LONG for the large ones, so it'll generate plain memcpy()+memset(). Reproducer added to tools/testing/selftests/core/close_range_test.c Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- include/linux/bitmap.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 8c4768c44a01..d3b66d77df7a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -270,6 +270,18 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst, dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } +static inline void bitmap_copy_and_extend(unsigned long *to, + const unsigned long *from, + unsigned int count, unsigned int size) +{ + unsigned int copy = BITS_TO_LONGS(count); + + memcpy(to, from, copy * sizeof(long)); + if (count % BITS_PER_LONG) + to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); + memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); +} + /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. -- cgit v1.2.3 From 6e2fdceffdc6bd7b8ba314a1d1b976721533c8f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 26 Jul 2024 14:42:08 -0400 Subject: tracing: Use refcount for trace_event_file reference counter Instead of using an atomic counter for the trace_event_file reference counter, use the refcount interface. It has various checks to make sure the reference counting is correct, and will warn if it detects an error (like refcount_inc() on '0'). Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240726144208.687cce24@rorschach.local.home Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..fed58e54f15e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -680,7 +680,7 @@ struct trace_event_file { * caching and such. Which is mostly OK ;-) */ unsigned long flags; - atomic_t ref; /* ref count for opened files */ + refcount_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; -- cgit v1.2.3 From 58f7e4d7ba32758b861807e77535853cacc1f426 Mon Sep 17 00:00:00 2001 From: Jianhui Zhou <912460177@qq.com> Date: Mon, 5 Aug 2024 19:36:31 +0800 Subject: ring-buffer: Remove unused function ring_buffer_nr_pages() Because ring_buffer_nr_pages() is not an inline function and user accesses buffer->buffers[cpu]->nr_pages directly, the function ring_buffer_nr_pages is removed. Signed-off-by: Jianhui Zhou <912460177@qq.com> Link: https://lore.kernel.org/tencent_F4A7E9AB337F44E0F4B858D07D19EF460708@qq.com Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 96d2140b471e..fd35d4ec12e1 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -193,7 +193,6 @@ void ring_buffer_set_clock(struct trace_buffer *buffer, void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs); bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer); -size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu); size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu); struct buffer_data_read_page; -- cgit v1.2.3 From b54de55990b0467538c6bb33523b28816384958a Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 7 Aug 2024 17:06:12 +0100 Subject: net: ethtool: fix off-by-one error in max RSS context IDs Both ethtool_ops.rxfh_max_context_id and the default value used when it's not specified are supposed to be exclusive maxima (the former is documented as such; the latter, U32_MAX, cannot be used as an ID since it equals ETH_RXFH_CONTEXT_ALLOC), but xa_alloc() expects an inclusive maximum. Subtract one from 'limit' to produce an inclusive maximum, and pass that to xa_alloc(). Increase bnxt's max by one to prevent a (very minor) regression, as BNXT_MAX_ETH_RSS_CTX is an inclusive max. This is safe since bnxt is not actually hard-limited; BNXT_MAX_ETH_RSS_CTX is just a leftover from old driver code that managed context IDs itself. Rename rxfh_max_context_id to rxfh_max_num_contexts to make its semantics (hopefully) more obvious. Fixes: 847a8ab18676 ("net: ethtool: let the core choose RSS context IDs") Signed-off-by: Edward Cree Link: https://patch.msgid.link/5a2d11a599aa5b0cc6141072c01accfb7758650c.1723045898.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 303fda54ef17..989c94eddb2b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -736,10 +736,10 @@ struct kernel_ethtool_ts_info { * @rxfh_key_space: same as @rxfh_indir_space, but for the key. * @rxfh_priv_size: size of the driver private data area the core should * allocate for an RSS context (in &struct ethtool_rxfh_context). - * @rxfh_max_context_id: maximum (exclusive) supported RSS context ID. If this - * is zero then the core may choose any (nonzero) ID, otherwise the core - * will only use IDs strictly less than this value, as the @rss_context - * argument to @create_rxfh_context and friends. + * @rxfh_max_num_contexts: maximum (exclusive) supported RSS context ID. + * If this is zero then the core may choose any (nonzero) ID, otherwise + * the core will only use IDs strictly less than this value, as the + * @rss_context argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -954,7 +954,7 @@ struct ethtool_ops { u32 rxfh_indir_space; u16 rxfh_key_space; u16 rxfh_priv_size; - u32 rxfh_max_context_id; + u32 rxfh_max_num_contexts; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); -- cgit v1.2.3 From 5819e464a17587e6830cfab05f3e91a9a8753a41 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 8 Aug 2024 14:08:08 +1000 Subject: cpumask: Fix crash on updating CPU enabled mask The CPU enabled mask instead of the CPU possible mask should be used by set_cpu_enabled(). Otherwise, we run into crash due to write to the read-only CPU possible mask when vCPU is hot added on ARM64. (qemu) device_add host-arm-cpu,id=cpu1,socket-id=1 Unable to handle kernel write to read-only memory at virtual address ffff800080fa7190 : Call trace: register_cpu+0x1a4/0x2e8 arch_register_cpu+0x84/0xd8 acpi_processor_add+0x480/0x5b0 acpi_bus_attach+0x1c4/0x300 acpi_dev_for_one_check+0x3c/0x50 device_for_each_child+0x68/0xc8 acpi_dev_for_each_child+0x48/0x80 acpi_bus_attach+0x84/0x300 acpi_bus_scan+0x74/0x220 acpi_scan_rescan_bus+0x54/0x88 acpi_device_hotplug+0x208/0x478 acpi_hotplug_work_fn+0x2c/0x50 process_one_work+0x15c/0x3c0 worker_thread+0x2ec/0x400 kthread+0x120/0x130 ret_from_fork+0x10/0x20 Fix it by passing the CPU enabled mask instead of the CPU possible mask to set_cpu_enabled(). Fixes: 51c4767503d5 ("Merge tag 'bitmap-6.11-rc1' of https://github.com:/norov/linux") Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Signed-off-by: Yury Norov --- include/linux/cpumask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 801a7e524113..53158de44b83 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1037,7 +1037,7 @@ void init_cpu_online(const struct cpumask *src); assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) -#define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_possible_mask, (enabled)) +#define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) -- cgit v1.2.3 From 86509e38a80da34d7800985fa2be183475242c8c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 9 Aug 2024 15:50:35 +0200 Subject: file: fix typo in take_fd() comment The explanatory comment above take_fd() contains a typo, fix that to not confuse readers. Signed-off-by: Mathias Krause Link: https://lore.kernel.org/r/20240809135035.748109-1-minipli@grsecurity.net Signed-off-by: Christian Brauner --- include/linux/file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 237931f20739..59b146a14dca 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -110,7 +110,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), * * f = dentry_open(&path, O_RDONLY, current_cred()); * if (IS_ERR(f)) - * return PTR_ERR(fd); + * return PTR_ERR(f); * * fd_install(fd, f); * return take_fd(fd); -- cgit v1.2.3 From 7b589a9b45ae32aa9d7bece597490e141198d7a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Aug 2024 19:38:46 +0100 Subject: netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags The NETFS_RREQ_USE_PGPRIV2 and NETFS_RREQ_WRITE_TO_CACHE flags aren't used correctly. The problem is that we try to set them up in the request initialisation, but we the cache may be in the process of setting up still, and so the state may not be correct. Further, we secondarily sample the cache state and make contradictory decisions later. The issue arises because we set up the cache resources, which allows the cache's ->prepare_read() to switch on NETFS_SREQ_COPY_TO_CACHE - which triggers cache writing even if we didn't set the flags when allocating. Fix this in the following way: (1) Drop NETFS_ICTX_USE_PGPRIV2 and instead set NETFS_RREQ_USE_PGPRIV2 in ->init_request() rather than trying to juggle that in netfs_alloc_request(). (2) Repurpose NETFS_RREQ_USE_PGPRIV2 to merely indicate that if caching is to be done, then PG_private_2 is to be used rather than only setting it if we decide to cache and then having netfs_rreq_unlock_folios() set the non-PG_private_2 writeback-to-cache if it wasn't set. (3) Split netfs_rreq_unlock_folios() into two functions, one of which contains the deprecated code for using PG_private_2 to avoid accidentally doing the writeback path - and always use it if USE_PGPRIV2 is set. (4) As NETFS_ICTX_USE_PGPRIV2 is removed, make netfs_write_begin() always wait for PG_private_2. This function is deprecated and only used by ceph anyway, and so label it so. (5) Drop the NETFS_RREQ_WRITE_TO_CACHE flag and use fscache_operation_valid() on the cache_resources instead. This has the advantage of picking up the result of netfs_begin_cache_read() and fscache_begin_write_operation() - which are called after the object is initialised and will wait for the cache to come to a usable state. Just reverting ae678317b95e[1] isn't a sufficient fix, so this need to be applied on top of that. Without this as well, things like: rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { and: WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386 may happen, along with some UAFs due to PG_private_2 not getting used to wait on writeback completion. Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Reported-by: Max Kellermann Signed-off-by: David Howells cc: Ilya Dryomov cc: Xiubo Li cc: Hristo Venev cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/1173209.1723152682@warthog.procyon.org.uk Signed-off-by: Christian Brauner --- include/linux/netfs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d0288938cc2..983816608f15 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,8 +73,6 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ -#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark - * write to cache on read */ }; /* @@ -269,7 +267,6 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ -#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ -- cgit v1.2.3 From fdad456cbcca739bae1849549c7a999857c56f88 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 28 Jul 2024 19:46:11 +0800 Subject: bpf: Fix updating attached freplace prog in prog_array map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit f7866c358733 ("bpf: Fix null pointer dereference in resolve_prog_type() for BPF_PROG_TYPE_EXT") fixed a NULL pointer dereference panic, but didn't fix the issue that fails to update attached freplace prog to prog_array map. Since commit 1c123c567fb1 ("bpf: Resolve fext program type when checking map compatibility"), freplace prog and its target prog are able to tail call each other. And the commit 3aac1ead5eb6 ("bpf: Move prog->aux->linked_prog and trampoline into bpf_link on attach") sets prog->aux->dst_prog as NULL after attaching freplace prog to its target prog. After loading freplace the prog_array's owner type is BPF_PROG_TYPE_SCHED_CLS. Then, after attaching freplace its prog->aux->dst_prog is NULL. Then, while updating freplace in prog_array the bpf_prog_map_compatible() incorrectly returns false because resolve_prog_type() returns BPF_PROG_TYPE_EXT instead of BPF_PROG_TYPE_SCHED_CLS. After this patch the resolve_prog_type() returns BPF_PROG_TYPE_SCHED_CLS and update to prog_array can succeed. Fixes: f7866c358733 ("bpf: Fix null pointer dereference in resolve_prog_type() for BPF_PROG_TYPE_EXT") Cc: Toke Høiland-Jørgensen Cc: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240728114612.48486-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a3..7b776dae36e5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -856,8 +856,8 @@ static inline u32 type_flag(u32 type) /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { - return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->dst_prog) ? - prog->aux->dst_prog->type : prog->type; + return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->saved_dst_prog_type) ? + prog->aux->saved_dst_prog_type : prog->type; } static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) -- cgit v1.2.3 From 92567a5f92bc947fb7aa4351979db1b7b71a554c Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 8 Aug 2024 22:06:19 +0800 Subject: iommu: Remove unused declaration iommu_sva_unbind_gpasid() Commit 0c9f17877891 ("iommu: Remove guest pasid related interfaces and definitions") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240808140619.2498535-1-yuehaibing@huawei.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4d47f2c33311..04cbdae0052e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -795,8 +795,6 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); -extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, -- cgit v1.2.3 From 2a0629834cd82f05d424bbc193374f9a43d1f87d Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 9 Aug 2024 11:16:28 +0800 Subject: vfs: Don't evict inode under the inode lru traversing context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inode reclaiming process(See function prune_icache_sb) collects all reclaimable inodes and mark them with I_FREEING flag at first, at that time, other processes will be stuck if they try getting these inodes (See function find_inode_fast), then the reclaiming process destroy the inodes by function dispose_list(). Some filesystems(eg. ext4 with ea_inode feature, ubifs with xattr) may do inode lookup in the inode evicting callback function, if the inode lookup is operated under the inode lru traversing context, deadlock problems may happen. Case 1: In function ext4_evict_inode(), the ea inode lookup could happen if ea_inode feature is enabled, the lookup process will be stuck under the evicting context like this: 1. File A has inode i_reg and an ea inode i_ea 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea 3. Then, following three processes running like this: PA PB echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // i_reg is added into lru, lru->i_ea->i_reg prune_icache_sb list_lru_walk_one inode_lru_isolate i_ea->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(i_reg) spin_unlock(&i_reg->i_lock) spin_unlock(lru_lock) rm file A i_reg->nlink = 0 iput(i_reg) // i_reg->nlink is 0, do evict ext4_evict_inode ext4_xattr_delete_inode ext4_xattr_inode_dec_ref_all ext4_xattr_inode_iget ext4_iget(i_ea->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(i_ea) ----→ AA deadlock dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&i_ea->i_state) Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file deleting process holds BASEHD's wbuf->io_mutex while getting the xattr inode, which could race with inode reclaiming process(The reclaiming process could try locking BASEHD's wbuf->io_mutex in inode evicting function), then an ABBA deadlock problem would happen as following: 1. File A has inode ia and a xattr(with inode ixa), regular file B has inode ib and a xattr. 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa 3. Then, following three processes running like this: PA PB PC echo 2 > /proc/sys/vm/drop_caches shrink_slab prune_dcache_sb // ib and ia are added into lru, lru->ixa->ib->ia prune_icache_sb list_lru_walk_one inode_lru_isolate ixa->i_state |= I_FREEING // set inode state inode_lru_isolate __iget(ib) spin_unlock(&ib->i_lock) spin_unlock(lru_lock) rm file B ib->nlink = 0 rm file A iput(ia) ubifs_evict_inode(ia) ubifs_jnl_delete_inode(ia) ubifs_jnl_write_inode(ia) make_reservation(BASEHD) // Lock wbuf->io_mutex ubifs_iget(ixa->i_ino) iget_locked find_inode_fast __wait_on_freeing_inode(ixa) | iput(ib) // ib->nlink is 0, do evict | ubifs_evict_inode | ubifs_jnl_delete_inode(ib) ↓ ubifs_jnl_write_inode ABBA deadlock ←-----make_reservation(BASEHD) dispose_list // cannot be executed by prune_icache_sb wake_up_bit(&ixa->i_state) Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING to pin the inode in memory while inode_lru_isolate() reclaims its pages instead of using ordinary inode reference. This way inode deletion cannot be triggered from inode_lru_isolate() thus avoiding the deadlock. evict() is made to wait for I_LRU_ISOLATING to be cleared before proceeding with inode cleanup. Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022 Fixes: e50e5129f384 ("ext4: xattr-in-inode support") Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files") Cc: stable@vger.kernel.org Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240809031628.1069873-1-chengzhihao@huaweicloud.com Reviewed-by: Jan Kara Suggested-by: Jan Kara Suggested-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..fb0426f349fc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) #define I_PINNING_NETFS_WB (1 << 18) +#define __I_LRU_ISOLATING 19 +#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -- cgit v1.2.3 From bcc954c6caba01fca143162d5fbb90e46aa1ad80 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Mon, 12 Aug 2024 16:27:03 +0900 Subject: printk/panic: Allow cpu backtraces to be written into ringbuffer during panic commit 779dbc2e78d7 ("printk: Avoid non-panic CPUs writing to ringbuffer") disabled non-panic CPUs to further write messages to ringbuffer after panicked. Since the commit, non-panicked CPU's are not allowed to write to ring buffer after panicked and CPU backtrace which is triggered after panicked to sample non-panicked CPUs' backtrace no longer serves its function as it has nothing to print. Fix the issue by allowing non-panicked CPUs to write into ringbuffer while CPU backtrace is in flight. Fixes: 779dbc2e78d7 ("printk: Avoid non-panic CPUs writing to ringbuffer") Signed-off-by: Ryo Takakura Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240812072703.339690-1-takakura@valinux.co.jp Signed-off-by: Petr Mladek --- include/linux/panic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index 3130e0b5116b..54d90b6c5f47 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -16,6 +16,7 @@ extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); +extern bool panic_triggering_all_cpu_backtrace; extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; -- cgit v1.2.3 From 66155de93bcf4f2967e602a4b3bf7ebe58f34b11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:02:58 -0700 Subject: KVM: x86: Disallow read-only memslots for SEV-ES and SEV-SNP (and TDX) Disallow read-only memslots for SEV-{ES,SNP} VM types, as KVM can't directly emulate instructions for ES/SNP, and instead the guest must explicitly request emulation. Unless the guest explicitly requests emulation without accessing memory, ES/SNP relies on KVM creating an MMIO SPTE, with the subsequent #NPF being reflected into the guest as a #VC. But for read-only memslots, KVM deliberately doesn't create MMIO SPTEs, because except for ES/SNP, doing so requires setting reserved bits in the SPTE, i.e. the SPTE can't be readable while also generating a #VC on writes. Because KVM never creates MMIO SPTEs and jumps directly to emulation, the guest never gets a #VC. And since KVM simply resumes the guest if ES/SNP guests trigger emulation, KVM effectively puts the vCPU into an infinite #NPF loop if the vCPU attempts to write read-only memory. Disallow read-only memory for all VMs with protected state, i.e. for upcoming TDX VMs as well as ES/SNP VMs. For TDX, it's actually possible to support read-only memory, as TDX uses EPT Violation #VE to reflect the fault into the guest, e.g. KVM could configure read-only SPTEs with RX protections and SUPPRESS_VE=0. But there is no strong use case for supporting read-only memslots on TDX, e.g. the main historical usage is to emulate option ROMs, but TDX disallows executing from shared memory. And if someone comes along with a legitimate, strong use case, the restriction can always be lifted for TDX. Don't bother trying to retroactively apply the restriction to SEV-ES VMs that are created as type KVM_X86_DEFAULT_VM. Read-only memslots can't possibly work for SEV-ES, i.e. disallowing such memslots is really just means reporting an error to userspace instead of silently hanging vCPUs. Trying to deal with the ordering between KVM_SEV_INIT and memslot creation isn't worth the marginal benefit it would provide userspace. Fixes: 26c44aa9e076 ("KVM: SEV: define VM types for SEV and SEV-ES") Fixes: 1dfe571c12cf ("KVM: SEV: Add initial SEV-SNP support") Cc: Peter Gonda Cc: Michael Roth Cc: Vishal Annapurve Cc: Ackerly Tng Signed-off-by: Sean Christopherson Message-ID: <20240809190319.1710470-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 79a6b1a63027..b23c6d48392f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -715,6 +715,13 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) } #endif +#ifndef kvm_arch_has_readonly_mem +static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) +{ + return IS_ENABLED(CONFIG_HAVE_KVM_READONLY_MEM); +} +#endif + struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; -- cgit v1.2.3 From 71833e79a42178d8a50b5081c98c78ace9325628 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 14 Aug 2024 13:16:49 +0100 Subject: i2c: Use IS_REACHABLE() for substituting empty ACPI functions Replace IS_ENABLED() with IS_REACHABLE() to substitute empty stubs for: i2c_acpi_get_i2c_resource() i2c_acpi_client_count() i2c_acpi_find_bus_speed() i2c_acpi_new_device_by_fwnode() i2c_adapter *i2c_acpi_find_adapter_by_handle() i2c_acpi_waive_d0_probe() commit f17c06c6608a ("i2c: Fix conditional for substituting empty ACPI functions") partially fixed this conditional to depend on CONFIG_I2C, but used IS_ENABLED(), which is wrong since CONFIG_I2C is tristate. CONFIG_ACPI is boolean but let's also change it to use IS_REACHABLE() to future-proof it against becoming tristate. Somehow despite testing various combinations of CONFIG_I2C and CONFIG_ACPI we missed the combination CONFIG_I2C=m, CONFIG_ACPI=y. Signed-off-by: Richard Fitzgerald Fixes: f17c06c6608a ("i2c: Fix conditional for substituting empty ACPI functions") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408141333.gYnaitcV-lkp@intel.com/ Reviewed-by: Takashi Iwai Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7eedd0c662da..377def497298 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1066,7 +1066,7 @@ static inline int of_i2c_get_board_info(struct device *dev, struct acpi_resource; struct acpi_resource_i2c_serialbus; -#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_I2C) +#if IS_REACHABLE(CONFIG_ACPI) && IS_REACHABLE(CONFIG_I2C) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); -- cgit v1.2.3 From 5f75cfbd6bb02295ddaed48adf667b6c828ce07b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 1 Aug 2024 22:47:48 +0200 Subject: mm/hugetlb: fix hugetlb vs. core-mm PT locking We recently made GUP's common page table walking code to also walk hugetlb VMAs without most hugetlb special-casing, preparing for the future of having less hugetlb-specific page table walking code in the codebase. Turns out that we missed one page table locking detail: page table locking for hugetlb folios that are not mapped using a single PMD/PUD. Assume we have hugetlb folio that spans multiple PTEs (e.g., 64 KiB hugetlb folios on arm64 with 4 KiB base page size). GUP, as it walks the page tables, will perform a pte_offset_map_lock() to grab the PTE table lock. However, hugetlb that concurrently modifies these page tables would actually grab the mm->page_table_lock: with USE_SPLIT_PTE_PTLOCKS, the locks would differ. Something similar can happen right now with hugetlb folios that span multiple PMDs when USE_SPLIT_PMD_PTLOCKS. This issue can be reproduced [1], for example triggering: [ 3105.936100] ------------[ cut here ]------------ [ 3105.939323] WARNING: CPU: 31 PID: 2732 at mm/gup.c:142 try_grab_folio+0x11c/0x188 [ 3105.944634] Modules linked in: [...] [ 3105.974841] CPU: 31 PID: 2732 Comm: reproducer Not tainted 6.10.0-64.eln141.aarch64 #1 [ 3105.980406] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-4.fc40 05/24/2024 [ 3105.986185] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 3105.991108] pc : try_grab_folio+0x11c/0x188 [ 3105.994013] lr : follow_page_pte+0xd8/0x430 [ 3105.996986] sp : ffff80008eafb8f0 [ 3105.999346] x29: ffff80008eafb900 x28: ffffffe8d481f380 x27: 00f80001207cff43 [ 3106.004414] x26: 0000000000000001 x25: 0000000000000000 x24: ffff80008eafba48 [ 3106.009520] x23: 0000ffff9372f000 x22: ffff7a54459e2000 x21: ffff7a546c1aa978 [ 3106.014529] x20: ffffffe8d481f3c0 x19: 0000000000610041 x18: 0000000000000001 [ 3106.019506] x17: 0000000000000001 x16: ffffffffffffffff x15: 0000000000000000 [ 3106.024494] x14: ffffb85477fdfe08 x13: 0000ffff9372ffff x12: 0000000000000000 [ 3106.029469] x11: 1fffef4a88a96be1 x10: ffff7a54454b5f0c x9 : ffffb854771b12f0 [ 3106.034324] x8 : 0008000000000000 x7 : ffff7a546c1aa980 x6 : 0008000000000080 [ 3106.038902] x5 : 00000000001207cf x4 : 0000ffff9372f000 x3 : ffffffe8d481f000 [ 3106.043420] x2 : 0000000000610041 x1 : 0000000000000001 x0 : 0000000000000000 [ 3106.047957] Call trace: [ 3106.049522] try_grab_folio+0x11c/0x188 [ 3106.051996] follow_pmd_mask.constprop.0.isra.0+0x150/0x2e0 [ 3106.055527] follow_page_mask+0x1a0/0x2b8 [ 3106.058118] __get_user_pages+0xf0/0x348 [ 3106.060647] faultin_page_range+0xb0/0x360 [ 3106.063651] do_madvise+0x340/0x598 Let's make huge_pte_lockptr() effectively use the same PT locks as any core-mm page table walker would. Add ptep_lockptr() to obtain the PTE page table lock using a pte pointer -- unfortunately we cannot convert pte_lockptr() because virt_to_page() doesn't work with kmap'ed page tables we can have with CONFIG_HIGHPTE. Handle CONFIG_PGTABLE_LEVELS correctly by checking in reverse order, such that when e.g., CONFIG_PGTABLE_LEVELS==2 with PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE will work as expected. Document why that works. There is one ugly case: powerpc 8xx, whereby we have an 8 MiB hugetlb folio being mapped using two PTE page tables. While hugetlb wants to take the PMD table lock, core-mm would grab the PTE table lock of one of both PTE page tables. In such corner cases, we have to make sure that both locks match, which is (fortunately!) currently guaranteed for 8xx as it does not support SMP and consequently doesn't use split PT locks. [1] https://lore.kernel.org/all/1bbfcc7f-f222-45a5-ac44-c5a1381c596d@redhat.com/ Link: https://lkml.kernel.org/r/20240801204748.99107-1-david@redhat.com Fixes: 9cb28da54643 ("mm/gup: handle hugetlb in the generic follow_page_mask code") Signed-off-by: David Hildenbrand Acked-by: Peter Xu Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Peter Xu Cc: Oscar Salvador Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 33 ++++++++++++++++++++++++++++++--- include/linux/mm.h | 11 +++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c9bf68c239a0..45bf05ad5c53 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -944,10 +944,37 @@ static inline bool htlb_allow_alloc_fallback(int reason) static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { - if (huge_page_size(h) == PMD_SIZE) + const unsigned long size = huge_page_size(h); + + VM_WARN_ON(size == PAGE_SIZE); + + /* + * hugetlb must use the exact same PT locks as core-mm page table + * walkers would. When modifying a PTE table, hugetlb must take the + * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD + * PT lock etc. + * + * The expectation is that any hugetlb folio smaller than a PMD is + * always mapped into a single PTE table and that any hugetlb folio + * smaller than a PUD (but at least as big as a PMD) is always mapped + * into a single PMD table. + * + * If that does not hold for an architecture, then that architecture + * must disable split PT locks such that all *_lockptr() functions + * will give us the same result: the per-MM PT lock. + * + * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where + * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() + * and core-mm would use pmd_lockptr(). However, in such configurations + * split PMD locks are disabled -- they don't make sense on a single + * PGDIR page table -- and the end result is the same. + */ + if (size >= PUD_SIZE) + return pud_lockptr(mm, (pud_t *) pte); + else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); - VM_BUG_ON(huge_page_size(h) == PAGE_SIZE); - return &mm->page_table_lock; + /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ + return ptep_lockptr(mm, pte); } #ifndef hugepages_supported diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..6549d0979b28 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2920,6 +2920,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); + BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); + return ptlock_ptr(virt_to_ptdesc(pte)); +} + static inline bool ptlock_init(struct ptdesc *ptdesc) { /* @@ -2944,6 +2951,10 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} -- cgit v1.2.3 From f4cb78af91e3b2b7aa76dbf8213b898fa8811b12 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:35 +0000 Subject: mm: add system wide stats items category /proc/vmstat contains events and stats, events can only grow, but stats can grow and shrink. vmstat has the following: ------------------------- NR_VM_ZONE_STAT_ITEMS: per-zone stats NR_VM_NUMA_EVENT_ITEMS: per-numa events NR_VM_NODE_STAT_ITEMS: per-numa stats NR_VM_WRITEBACK_STAT_ITEMS: system-wide background-writeback and dirty-throttling tresholds. NR_VM_EVENT_ITEMS: system-wide events ------------------------- Rename NR_VM_WRITEBACK_STAT_ITEMS to NR_VM_STAT_ITEMS, to track the system-wide stats, we are going to add per-page metadata stats to this category in the next patch. Also delete unused writeback_stat_name(). Link: https://lkml.kernel.org/r/20240809191020.1142142-2-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-3-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Suggested-by: Yosry Ahmed Tested-by: Alison Schofield Acked-by: David Hildenbrand Acked-by: David Rientjes Cc: Dan Williams Cc: Domenico Cerasuolo Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yi Zhang Cc: Fan Ni Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 23cd17942036..9ab4fa5e09b5 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -34,10 +34,11 @@ struct reclaim_stat { unsigned nr_lazyfree_fail; }; -enum writeback_stat_item { +/* Stat data for system wide items */ +enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, - NR_VM_WRITEBACK_STAT_ITEMS, + NR_VM_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS @@ -514,21 +515,13 @@ static inline const char *lru_list_name(enum lru_list lru) return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } -static inline const char *writeback_stat_name(enum writeback_stat_item item) -{ - return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_EVENT_ITEMS + - NR_VM_NODE_STAT_ITEMS + - item]; -} - #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + - NR_VM_WRITEBACK_STAT_ITEMS + + NR_VM_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ -- cgit v1.2.3 From 9d85731110241fb8ca9445ea4177d816041a8825 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 8 Aug 2024 21:34:36 +0000 Subject: mm: don't account memmap per-node Fix invalid access to pgdat during hot-remove operation: ndctl users reported a GPF when trying to destroy a namespace: $ ndctl destroy-namespace all -r all -f Segmentation fault dmesg: Oops: general protection fault, probably for non-canonical address 0xdffffc0000005650: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: probably user-memory-access in range [0x000000000002b280-0x000000000002b287] CPU: 26 UID: 0 PID: 1868 Comm: ndctl Not tainted 6.11.0-rc1 #1 Hardware name: Dell Inc. PowerEdge R640/08HT8T, BIOS 2.20.1 09/13/2023 RIP: 0010:mod_node_page_state+0x2a/0x110 cxl-test users report a GPF when trying to unload the test module: $ modrpobe -r cxl-test dmesg BUG: unable to handle page fault for address: 0000000000004200 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 UID: 0 PID: 1076 Comm: modprobe Tainted: G O N 6.11.0-rc1 #197 Tainted: [O]=OOT_MODULE, [N]=TEST Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/15 RIP: 0010:mod_node_page_state+0x6/0x90 Currently, when memory is hot-plugged or hot-removed the accounting is done based on the assumption that memmap is allocated from the same node as the hot-plugged/hot-removed memory, which is not always the case. In addition, there are challenges with keeping the node id of the memory that is being remove to the time when memmap accounting is actually performed: since this is done after remove_pfn_range_from_zone(), and also after remove_memory_block_devices(). Meaning that we cannot use pgdat nor walking though memblocks to get the nid. Given all of that, account the memmap overhead system wide instead. For this we are going to be using global atomic counters, but given that memmap size is rarely modified, and normally is only modified either during early boot when there is only one CPU, or under a hotplug global mutex lock, therefore there is no need for per-cpu optimizations. Also, while we are here rename nr_memmap to nr_memmap_pages, and nr_memmap_boot to nr_memmap_boot_pages to be self explanatory that the units are in page count. [pasha.tatashin@soleen.com: address a few nits from David Hildenbrand] Link: https://lkml.kernel.org/r/20240809191020.1142142-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240809191020.1142142-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240808213437.682006-4-pasha.tatashin@soleen.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Pasha Tatashin Reported-by: Yi Zhang Closes: https://lore.kernel.org/linux-cxl/CAHj4cs9Ax1=CoJkgBGP_+sNu6-6=6v=_L-ZBZY0bVLD3wUWZQg@mail.gmail.com Reported-by: Alison Schofield Closes: https://lore.kernel.org/linux-mm/Zq0tPd2h6alFz8XF@aschofie-mobl2/#t Tested-by: Dan Williams Tested-by: Alison Schofield Acked-by: David Hildenbrand Acked-by: David Rientjes Tested-by: Yi Zhang Cc: Domenico Cerasuolo Cc: Fan Ni Cc: Joel Granados Cc: Johannes Weiner Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- include/linux/vmstat.h | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 41458892bc8a..1dc6248feb83 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,8 +220,6 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, - NR_MEMMAP, /* page metadata allocated through buddy allocator */ - NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */ NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9ab4fa5e09b5..9eb77c9007e6 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -38,6 +38,8 @@ struct reclaim_stat { enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, + NR_MEMMAP_PAGES, /* page metadata allocated through buddy allocator */ + NR_MEMMAP_BOOT_PAGES, /* page metadata allocated through boot allocator */ NR_VM_STAT_ITEMS, }; @@ -618,7 +620,6 @@ static inline void lruvec_stat_sub_folio(struct folio *folio, lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -void __meminit mod_node_early_perpage_metadata(int nid, long delta); -void __meminit store_early_perpage_metadata(void); - +void memmap_boot_pages_add(long delta); +void memmap_pages_add(long delta); #endif /* _LINUX_VMSTAT_H */ -- cgit v1.2.3 From a8fc28dad6d574582cdf2f7e78c73c59c623df30 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Aug 2024 08:07:56 -0700 Subject: alloc_tag: introduce clear_page_tag_ref() helper function In several cases we are freeing pages which were not allocated using common page allocators. For such cases, in order to keep allocation accounting correct, we should clear the page tag to indicate that the page being freed is expected to not have a valid allocation tag. Introduce clear_page_tag_ref() helper function to be used for this. Link: https://lkml.kernel.org/r/20240813150758.855881-1-surenb@google.com Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") Signed-off-by: Suren Baghdasaryan Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Pasha Tatashin Cc: Kees Cook Cc: Kent Overstreet Cc: Sourav Panda Cc: Vlastimil Babka Cc: [6.10] Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 18cd0c0c73d9..207f0c83c8e9 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -43,6 +43,18 @@ static inline void put_page_tag_ref(union codetag_ref *ref) page_ext_put(page_ext_from_codetag_ref(ref)); } +static inline void clear_page_tag_ref(struct page *page) +{ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } +} + static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) { @@ -126,6 +138,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } static inline void put_page_tag_ref(union codetag_ref *ref) {} +static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -- cgit v1.2.3 From 67c3ca2c5cfe6a50772514e3349b5e7b3b0fac03 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:02 +0300 Subject: net: mscc: ocelot: use ocelot_xmit_get_vlan_info() also for FDMA and register injection Problem description ------------------- On an NXP LS1028A (felix DSA driver) with the following configuration: - ocelot-8021q tagging protocol - VLAN-aware bridge (with STP) spanning at least swp0 and swp1 - 8021q VLAN upper interfaces on swp0 and swp1: swp0.700, swp1.700 - ptp4l on swp0.700 and swp1.700 we see that the ptp4l instances do not see each other's traffic, and they all go to the grand master state due to the ANNOUNCE_RECEIPT_TIMEOUT_EXPIRES condition. Jumping to the conclusion for the impatient ------------------------------------------- There is a zero-day bug in the ocelot switchdev driver in the way it handles VLAN-tagged packet injection. The correct logic already exists in the source code, in function ocelot_xmit_get_vlan_info() added by commit 5ca721c54d86 ("net: dsa: tag_ocelot: set the classified VLAN during xmit"). But it is used only for normal NPI-based injection with the DSA "ocelot" tagging protocol. The other injection code paths (register-based and FDMA-based) roll their own wrong logic. This affects and was noticed on the DSA "ocelot-8021q" protocol because it uses register-based injection. By moving ocelot_xmit_get_vlan_info() to a place that's common for both the DSA tagger and the ocelot switch library, it can also be called from ocelot_port_inject_frame() in ocelot.c. We need to touch the lines with ocelot_ifh_port_set()'s prototype anyway, so let's rename it to something clearer regarding what it does, and add a kernel-doc. ocelot_ifh_set_basic() should do. Investigation notes ------------------- Debugging reveals that PTP event (aka those carrying timestamps, like Sync) frames injected into swp0.700 (but also swp1.700) hit the wire with two VLAN tags: 00000000: 01 1b 19 00 00 00 00 01 02 03 04 05 81 00 02 bc ~~~~~~~~~~~ 00000010: 81 00 02 bc 88 f7 00 12 00 2c 00 00 02 00 00 00 ~~~~~~~~~~~ 00000020: 00 00 00 00 00 00 00 00 00 00 00 01 02 ff fe 03 00000030: 04 05 00 01 00 04 00 00 00 00 00 00 00 00 00 00 00000040: 00 00 The second (unexpected) VLAN tag makes felix_check_xtr_pkt() -> ptp_classify_raw() fail to see these as PTP packets at the link partner's receiving end, and return PTP_CLASS_NONE (because the BPF classifier is not written to expect 2 VLAN tags). The reason why packets have 2 VLAN tags is because the transmission code treats VLAN incorrectly. Neither ocelot switchdev, nor felix DSA, declare the NETIF_F_HW_VLAN_CTAG_TX feature. Therefore, at xmit time, all VLANs should be in the skb head, and none should be in the hwaccel area. This is done by: static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } But ocelot_port_inject_frame() handles things incorrectly: ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb)); void ocelot_ifh_port_set(struct sk_buff *skb, void *ifh, int port, u32 rew_op) { (...) if (vlan_tag) ocelot_ifh_set_vlan_tci(ifh, vlan_tag); (...) } The way __vlan_hwaccel_push_inside() pushes the tag inside the skb head is by calling: static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { skb->vlan_present = 0; } which does _not_ zero out skb->vlan_tci as seen by skb_vlan_tag_get(). This means that ocelot, when it calls skb_vlan_tag_get(), sees (and uses) a residual skb->vlan_tci, while the same VLAN tag is _already_ in the skb head. The trivial fix for double VLAN headers is to replace the content of ocelot_ifh_port_set() with: if (skb_vlan_tag_present(skb)) ocelot_ifh_set_vlan_tci(ifh, skb_vlan_tag_get(skb)); but this would not be correct either, because, as mentioned, vlan_hw_offload_capable() is false for us, so we'd be inserting dead code and we'd always transmit packets with VID=0 in the injection frame header. I can't actually test the ocelot switchdev driver and rely exclusively on code inspection, but I don't think traffic from 8021q uppers has ever been injected properly, and not double-tagged. Thus I'm blaming the introduction of VLAN fields in the injection header - early driver code. As hinted at in the early conclusion, what we _want_ to happen for VLAN transmission was already described once in commit 5ca721c54d86 ("net: dsa: tag_ocelot: set the classified VLAN during xmit"). ocelot_xmit_get_vlan_info() intends to ensure that if the port through which we're transmitting is under a VLAN-aware bridge, the outer VLAN tag from the skb head is stripped from there and inserted into the injection frame header (so that the packet is processed in hardware through that actual VLAN). And in all other cases, the packet is sent with VID=0 in the injection frame header, since the port is VLAN-unaware and has logic to strip this VID on egress (making it invisible to the wire). Fixes: 08d02364b12f ("net: mscc: fix the injection header") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/ocelot.h | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index dca2969015d8..6fbfbde68a37 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -5,6 +5,8 @@ #ifndef _NET_DSA_TAG_OCELOT_H #define _NET_DSA_TAG_OCELOT_H +#include +#include #include #include #include @@ -273,4 +275,49 @@ static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb) return rew_op; } +/** + * ocelot_xmit_get_vlan_info: Determine VLAN_TCI and TAG_TYPE for injected frame + * @skb: Pointer to socket buffer + * @br: Pointer to bridge device that the port is under, if any + * @vlan_tci: + * @tag_type: + * + * If the port is under a VLAN-aware bridge, remove the VLAN header from the + * payload and move it into the DSA tag, which will make the switch classify + * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero, + * which is the pvid of standalone ports (OCELOT_STANDALONE_PVID), although not + * of VLAN-unaware bridge ports (that would be ocelot_vlan_unaware_pvid()). + * Anyway, VID 0 is fine because it is stripped on egress for these port modes, + * and source address learning is not performed for packets injected from the + * CPU anyway, so it doesn't matter that the VID is "wrong". + */ +static inline void ocelot_xmit_get_vlan_info(struct sk_buff *skb, + struct net_device *br, + u64 *vlan_tci, u64 *tag_type) +{ + struct vlan_ethhdr *hdr; + u16 proto, tci; + + if (!br || !br_vlan_enabled(br)) { + *vlan_tci = 0; + *tag_type = IFH_TAG_TYPE_C; + return; + } + + hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + br_vlan_get_proto(br, &proto); + + if (ntohs(hdr->h_vlan_proto) == proto) { + vlan_remove_tag(skb, &tci); + *vlan_tci = tci; + } else { + rcu_read_lock(); + br_vlan_get_pvid_rcu(br, &tci); + rcu_read_unlock(); + *vlan_tci = tci; + } + + *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C; +} + #endif -- cgit v1.2.3 From 3942bb49728ad9e1f94d953a88af169a8f5d8099 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 14 Aug 2024 13:00:34 +0300 Subject: string: add mem_is_zero() helper to check if memory area is all zeros Almost two thirds of the memchr_inv() usages check if the memory area is all zeros, with no interest in where in the buffer the first non-zero byte is located. Checking for !memchr_inv(s, 0, n) is also not very intuitive or discoverable. Add an explicit mem_is_zero() helper for this use case. Reviewed-by: Kees Cook Reviewed-by: Andy Shevchenko Link: https://patchwork.freedesktop.org/patch/msgid/20240814100035.3100852-1-jani.nikula@intel.com Signed-off-by: Jani Nikula --- include/linux/string.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 9edace076ddb..5855c5626b4b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -279,6 +279,18 @@ static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *str, char old, char new); +/** + * mem_is_zero - Check if an area of memory is all 0's. + * @s: The memory area + * @n: The size of the area + * + * Return: True if the area of memory is all 0's. + */ +static inline bool mem_is_zero(const void *s, size_t n) +{ + return !memchr_inv(s, 0, n); +} + extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; -- cgit v1.2.3 From 6e6f58a170ea98e44075b761f2da42a5aec47dfb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Aug 2024 16:29:11 +0200 Subject: thermal: gov_bang_bang: Use governor_data to reduce overhead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After running once, the for_each_trip_desc() loop in bang_bang_manage() is pure needless overhead because it is not going to make any changes unless a new cooling device has been bound to one of the trips in the thermal zone or the system is resuming from sleep. For this reason, make bang_bang_manage() set governor_data for the thermal zone and check it upfront to decide whether or not it needs to do anything. However, governor_data needs to be reset in some cases to let bang_bang_manage() know that it should walk the trips again, so add an .update_tz() callback to the governor and make the core additionally invoke it during system resume. To avoid affecting the other users of that callback unnecessarily, add a special notification reason for system resume, THERMAL_TZ_RESUME, and also pass it to __thermal_zone_device_update() called during system resume for consistency. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Kästle Reviewed-by: Zhang Rui Cc: 6.10+ # 6.10+ Link: https://patch.msgid.link/2285575.iZASKD2KPV@rjwysocki.net --- include/linux/thermal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 25fbf960b474..b86ddca46b9e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -55,6 +55,7 @@ enum thermal_notify_event { THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */ THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */ THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */ + THERMAL_TZ_RESUME, /* Thermal zone is resuming after system sleep */ }; /** -- cgit v1.2.3 From 81475beb1b5996505a39cd1d9316ce1e668932a2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 15 Aug 2024 16:32:28 +0000 Subject: block: Drop NULL check in bdev_write_zeroes_sectors() Function bdev_get_queue() must not return NULL, so drop the check in bdev_write_zeroes_sectors(). Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Nitesh Shetty Link: https://lore.kernel.org/r/20240815163228.216051-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e85ec73a07d5..b7664d593486 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1296,12 +1296,7 @@ bdev_max_secure_erase_sectors(struct block_device *bdev) static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return q->limits.max_write_zeroes_sectors; - - return 0; + return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) -- cgit v1.2.3