From f03b296e8b516dbd63f57fc9056c1b0da1b9a0ff Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 14 Oct 2024 21:27:58 +0200 Subject: fs: pass offset and result to backing_file end_write() callback This is needed for extending fuse inode size after fuse passthrough write. Suggested-by: Miklos Szeredi Link: https://lore.kernel.org/linux-fsdevel/CAJfpegs=cvZ_NYy6Q_D42XhYS=Sjj5poM1b5TzXzOVvX=R36aA@mail.gmail.com/ Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- include/linux/backing-file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 4b61b0e57720..2eed0ffb5e8f 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -16,7 +16,7 @@ struct backing_file_ctx { const struct cred *cred; struct file *user_file; void (*accessed)(struct file *); - void (*end_write)(struct file *); + void (*end_write)(struct file *, loff_t, ssize_t); }; struct file *backing_file_open(const struct path *user_path, int flags, -- cgit v1.2.3 From 6fad274f06f038c29660aa53fbad14241c9fd976 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Oct 2024 17:28:05 +0200 Subject: bpf: Add MEM_WRITE attribute Add a MEM_WRITE attribute for BPF helper functions which can be used in bpf_func_proto to annotate an argument type in order to let the verifier know that the helper writes into the memory passed as an argument. In the past MEM_UNINIT has been (ab)used for this function, but the latter merely tells the verifier that the passed memory can be uninitialized. There have been bugs with overloading the latter but aside from that there are also cases where the passed memory is read + written which currently cannot be expressed, see also 4b3786a6c539 ("bpf: Zero former ARG_PTR_TO_{LONG,INT} args in case of error"). Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241021152809.33343-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19d8ca8ac960..bdadb0bb6cec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -635,6 +635,7 @@ enum bpf_type_flag { */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), + /* MEM can be uninitialized. */ MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), /* DYNPTR points to memory local to the bpf program. */ @@ -700,6 +701,13 @@ enum bpf_type_flag { */ MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS), + /* MEM is being written to, often combined with MEM_UNINIT. Non-presence + * of MEM_WRITE means that MEM is only being read. MEM_WRITE without the + * MEM_UNINIT means that memory needs to be initialized since it is also + * read. + */ + MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -758,10 +766,10 @@ enum bpf_arg_type { ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, - /* pointer to memory does not need to be initialized, helper function must fill - * all bytes or clear them in error case. + /* Pointer to memory does not need to be initialized, since helper function + * fills all bytes or clears them in error case. */ - ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, + ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | MEM_WRITE | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */ ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, -- cgit v1.2.3 From c2f803052bc7a7feb2e03befccc8e49b6ff1f5f5 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Oct 2024 09:35:57 +0800 Subject: bpf: Add the missing BPF_LINK_TYPE invocation for sockmap There is an out-of-bounds read in bpf_link_show_fdinfo() for the sockmap link fd. Fix it by adding the missing BPF_LINK_TYPE invocation for sockmap link Also add comments for bpf_link_type to prevent missing updates in the future. Fixes: 699c23f02c65 ("bpf: Add bpf_link support for sk_msg and sk_skb progs") Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241024013558.1135167-2-houtao@huaweicloud.com --- include/linux/bpf_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9f2a6b83b49e..fa78f49d4a9a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -146,6 +146,7 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) BPF_LINK_TYPE(BPF_LINK_TYPE_NETFILTER, netfilter) BPF_LINK_TYPE(BPF_LINK_TYPE_TCX, tcx) BPF_LINK_TYPE(BPF_LINK_TYPE_NETKIT, netkit) +BPF_LINK_TYPE(BPF_LINK_TYPE_SOCKMAP, sockmap) #endif #ifdef CONFIG_PERF_EVENTS BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) -- cgit v1.2.3 From f64e67e5d3a45a4a04286c47afade4b518acd47b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 15 Oct 2024 18:56:05 +0100 Subject: fork: do not invoke uffd on fork if error occurs Patch series "fork: do not expose incomplete mm on fork". During fork we may place the virtual memory address space into an inconsistent state before the fork operation is complete. In addition, we may encounter an error during the fork operation that indicates that the virtual memory address space is invalidated. As a result, we should not be exposing it in any way to external machinery that might interact with the mm or VMAs, machinery that is not designed to deal with incomplete state. We specifically update the fork logic to defer khugepaged and ksm to the end of the operation and only to be invoked if no error arose, and disallow uffd from observing fork events should an error have occurred. This patch (of 2): Currently on fork we expose the virtual address space of a process to userland unconditionally if uffd is registered in VMAs, regardless of whether an error arose in the fork. This is performed in dup_userfaultfd_complete() which is invoked unconditionally, and performs two duties - invoking registered handlers for the UFFD_EVENT_FORK event via dup_fctx(), and clearing down userfaultfd_fork_ctx objects established in dup_userfaultfd(). This is problematic, because the virtual address space may not yet be correctly initialised if an error arose. The change in commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") makes this more pertinent as we may be in a state where entries in the maple tree are not yet consistent. We address this by, on fork error, ensuring that we roll back state that we would otherwise expect to clean up through the event being handled by userland and perform the memory freeing duty otherwise performed by dup_userfaultfd_complete(). We do this by implementing a new function, dup_userfaultfd_fail(), which performs the same loop, only decrementing reference counts. Note that we perform mmgrab() on the parent and child mm's, however userfaultfd_ctx_put() will mmdrop() this once the reference count drops to zero, so we will avoid memory leaks correctly here. Link: https://lkml.kernel.org/r/cover.1729014377.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/d3691d58bb58712b6fb3df2be441d175bd3cdf07.1729014377.git.lorenzo.stoakes@oracle.com Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Jann Horn Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Linus Torvalds Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9fc6ce15c499..cb40f1a1d081 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); +void dup_userfaultfd_fail(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); @@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l) { } +static inline void dup_userfaultfd_fail(struct list_head *l) +{ +} + static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *ctx) { -- cgit v1.2.3 From 985da552a98e27096444508ce5d853244019111f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 15 Oct 2024 18:56:06 +0100 Subject: fork: only invoke khugepaged, ksm hooks if no error There is no reason to invoke these hooks early against an mm that is in an incomplete state. The change in commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") makes this more pertinent as we may be in a state where entries in the maple tree are not yet consistent. Their placement early in dup_mmap() only appears to have been meaningful for early error checking, and since functionally it'd require a very small allocation to fail (in practice 'too small to fail') that'd only occur in the most dire circumstances, meaning the fork would fail or be OOM'd in any case. Since both khugepaged and KSM tracking are there to provide optimisations to memory performance rather than critical functionality, it doesn't really matter all that much if, under such dire memory pressure, we fail to register an mm with these. As a result, we follow the example of commit d2081b2bf819 ("mm: khugepaged: make khugepaged_enter() void function") and make ksm_fork() a void function also. We only expose the mm to these functions once we are done with them and only if no error occurred in the fork operation. Link: https://lkml.kernel.org/r/e0cb8b840c9d1d5a6e84d4f8eff5f3f2022aa10c.1729014377.git.lorenzo.stoakes@oracle.com Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Jann Horn Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Linus Torvalds Cc: Signed-off-by: Andrew Morton --- include/linux/ksm.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 11690dacd986..ec9c05044d4f 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) return atomic_long_read(&mm->ksm_zero_pages); } -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { + /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); - - return 0; + __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) @@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm) return 0; } -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - return 0; } static inline int ksm_execve(struct mm_struct *mm) -- cgit v1.2.3 From 62a898b07b83f6f407003d8a70f0827a5af08a59 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 30 Oct 2024 18:05:13 +0800 Subject: bpf: Add bpf_mem_alloc_check_size() helper Introduce bpf_mem_alloc_check_size() to check whether the allocation size exceeds the limitation for the kmalloc-equivalent allocator. The upper limit for percpu allocation is LLIST_NODE_SZ bytes larger than non-percpu allocation, so a percpu argument is added to the helper. The helper will be used in the following patch to check whether the size parameter passed to bpf_mem_alloc() is too big. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241030100516.3633640-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index aaf004d94322..e45162ef59bb 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -33,6 +33,9 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); +/* Check the allocation size for kmalloc equivalent allocator */ +int bpf_mem_alloc_check_size(bool percpu, size_t size); + /* kmalloc/kfree equivalent: */ void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); -- cgit v1.2.3 From 0fc810ae3ae110f9e2fcccce80fc8c8d62f97907 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Oct 2024 16:03:31 -1000 Subject: x86/uaccess: Avoid barrier_nospec() in 64-bit copy_from_user() The barrier_nospec() in 64-bit copy_from_user() is slow. Instead use pointer masking to force the user pointer to all 1's for an invalid address. The kernel test robot reports a 2.6% improvement in the per_thread_ops benchmark [1]. This is a variation on a patch originally by Josh Poimboeuf [2]. Link: https://lore.kernel.org/202410281344.d02c72a2-oliver.sang@intel.com [1] Link: https://lore.kernel.org/5b887fe4c580214900e21f6c61095adf9a142735.1730166635.git.jpoimboe@kernel.org [2] Tested-and-reviewed-by: Josh Poimboeuf Cc: Kirill A. Shutemov Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 39c7cf82b0c2..43844510d5d0 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -38,6 +38,7 @@ #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL + #define mask_user_address(src) (src) #endif /* @@ -159,19 +160,27 @@ _inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); - if (!should_fail_usercopy() && likely(access_ok(from, n))) { + if (should_fail_usercopy()) + goto fail; + if (can_do_masked_user_access()) + from = mask_user_address(from); + else { + if (!access_ok(from, n)) + goto fail; /* * Ensure that bad access_ok() speculation will not * lead to nasty side effects *after* the copy is * finished: */ barrier_nospec(); - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user(to, from, n); - instrument_copy_from_user_after(to, from, n, res); } - if (unlikely(res)) - memset(to + (n - res), 0, res); + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + if (likely(!res)) + return 0; +fail: + memset(to + (n - res), 0, res); return res; } extern __must_check unsigned long -- cgit v1.2.3