From 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 29 Oct 2024 18:11:46 +0000 Subject: mm: refactor map_deny_write_exec() Refactor the map_deny_write_exec() to not unnecessarily require a VMA parameter but rather to accept VMA flags parameters, which allows us to use this function early in mmap_region() in a subsequent commit. While we're here, we refactor the function to be more readable and add some additional documentation. Link: https://lkml.kernel.org/r/6be8bb59cd7c68006ebb006eb9d8dc27104b1f70.1730224667.git.lorenzo.stoakes@oracle.com Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Jann Horn Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Helge Deller Cc: James E.J. Bottomley Cc: Linus Torvalds Cc: Mark Brown Cc: Peter Xu Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- include/linux/mman.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mman.h b/include/linux/mman.h index bcb201ab7a41..8ddca62d6460 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -188,16 +188,31 @@ static inline bool arch_memory_deny_write_exec_supported(void) * * d) mmap(PROT_READ | PROT_EXEC) * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. */ -static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) +static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { + /* If MDWE is disabled, we have nothing to deny. */ if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) return false; - if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) + /* If the new VMA is not executable, we have nothing to deny. */ + if (!(new & VM_EXEC)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... */ + if (new & VM_WRITE) return true; - if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) + /* ...nor previously non-executable VMAs becoming executable. */ + if (!(old & VM_EXEC)) return true; return false; -- cgit v1.2.3 From 5baf8b037debf4ec60108ccfeccb8636d1dbad81 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 29 Oct 2024 18:11:47 +0000 Subject: mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling Currently MTE is permitted in two circumstances (desiring to use MTE having been specified by the VM_MTE flag) - where MAP_ANONYMOUS is specified, as checked by arch_calc_vm_flag_bits() and actualised by setting the VM_MTE_ALLOWED flag, or if the file backing the mapping is shmem, in which case we set VM_MTE_ALLOWED in shmem_mmap() when the mmap hook is activated in mmap_region(). The function that checks that, if VM_MTE is set, VM_MTE_ALLOWED is also set is the arm64 implementation of arch_validate_flags(). Unfortunately, we intend to refactor mmap_region() to perform this check earlier, meaning that in the case of a shmem backing we will not have invoked shmem_mmap() yet, causing the mapping to fail spuriously. It is inappropriate to set this architecture-specific flag in general mm code anyway, so a sensible resolution of this issue is to instead move the check somewhere else. We resolve this by setting VM_MTE_ALLOWED much earlier in do_mmap(), via the arch_calc_vm_flag_bits() call. This is an appropriate place to do this as we already check for the MAP_ANONYMOUS case here, and the shmem file case is simply a variant of the same idea - we permit RAM-backed memory. This requires a modification to the arch_calc_vm_flag_bits() signature to pass in a pointer to the struct file associated with the mapping, however this is not too egregious as this is only used by two architectures anyway - arm64 and parisc. So this patch performs this adjustment and removes the unnecessary assignment of VM_MTE_ALLOWED in shmem_mmap(). [akpm@linux-foundation.org: fix whitespace, per Catalin] Link: https://lkml.kernel.org/r/ec251b20ba1964fb64cf1607d2ad80c47f3873df.1730224667.git.lorenzo.stoakes@oracle.com Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") Signed-off-by: Lorenzo Stoakes Suggested-by: Catalin Marinas Reported-by: Jann Horn Reviewed-by: Catalin Marinas Reviewed-by: Vlastimil Babka Cc: Andreas Larsson Cc: David S. Miller Cc: Helge Deller Cc: James E.J. Bottomley Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Mark Brown Cc: Peter Xu Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- include/linux/mman.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mman.h b/include/linux/mman.h index 8ddca62d6460..a842783ffa62 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MMAN_H #define _LINUX_MMAN_H +#include #include #include @@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long pages) #endif #ifndef arch_calc_vm_flag_bits -#define arch_calc_vm_flag_bits(flags) 0 +#define arch_calc_vm_flag_bits(file, flags) 0 #endif #ifndef arch_validate_prot @@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey) * Combine the mmap "flags" argument into "vm_flags" used internally. */ static inline unsigned long -calc_vm_flag_bits(unsigned long flags) +calc_vm_flag_bits(struct file *file, unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | - arch_calc_vm_flag_bits(flags); + arch_calc_vm_flag_bits(file, flags); } unsigned long vm_commit_limit(void); -- cgit v1.2.3 From 44d0469f79bd3d0b3433732877358df7dc6b17b1 Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 00:37:42 +0000 Subject: bpf: Add sk_is_inet and IS_ICSK check in tls_sw_has_ctx_tx/rx As the introduction of the support for vsock and unix sockets in sockmap, tls_sw_has_ctx_tx/rx cannot presume the socket passed in must be IS_ICSK. vsock and af_unix sockets have vsock_sock and unix_sock instead of inet_connection_sock. For these sockets, tls_get_ctx may return an invalid pointer and cause page fault in function tls_sw_ctx_rx. BUG: unable to handle page fault for address: 0000000000040030 Workqueue: vsock-loopback vsock_loopback_work RIP: 0010:sk_psock_strp_data_ready+0x23/0x60 Call Trace: ? __die+0x81/0xc3 ? no_context+0x194/0x350 ? do_page_fault+0x30/0x110 ? async_page_fault+0x3e/0x50 ? sk_psock_strp_data_ready+0x23/0x60 virtio_transport_recv_pkt+0x750/0x800 ? update_load_avg+0x7e/0x620 vsock_loopback_work+0xd0/0x100 process_one_work+0x1a7/0x360 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x112/0x130 ? __kthread_cancel_work+0x40/0x40 ret_from_fork+0x1f/0x40 v2: - Add IS_ICSK check v3: - Update the commits in Fixes Fixes: 634f1a7110b4 ("vsock: support sockmap") Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") Signed-off-by: Zijian Zhang Acked-by: Stanislav Fomichev Acked-by: Jakub Kicinski Reviewed-by: Cong Wang Acked-by: Stefano Garzarella Link: https://lore.kernel.org/r/20241106003742.399240-1-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau --- include/net/tls.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 3a33924db2bc..61fef2880114 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -390,8 +390,12 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) static inline bool tls_sw_has_ctx_tx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_tx(ctx); @@ -399,8 +403,12 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) static inline bool tls_sw_has_ctx_rx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_rx(ctx); -- cgit v1.2.3 From b79276dcac9124a79c8cf7cc8fbdd3d4c3c9a7c7 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 4 Nov 2024 16:28:55 -0600 Subject: ACPI: processor: Move arch_init_invariance_cppc() call later arch_init_invariance_cppc() is called at the end of acpi_cppc_processor_probe() in order to configure frequency invariance based upon the values from _CPC. This however doesn't work on AMD CPPC shared memory designs that have AMD preferred cores enabled because _CPC needs to be analyzed from all cores to judge if preferred cores are enabled. This issue manifests to users as a warning since commit 21fb59ab4b97 ("ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn"): ``` Could not retrieve highest performance (-19) ``` However the warning isn't the cause of this, it was actually commit 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()") which exposed the issue. To fix this problem, change arch_init_invariance_cppc() into a new weak symbol that is called at the end of acpi_processor_driver_init(). Each architecture that supports it can declare the symbol to override the weak one. Define it for x86, in arch/x86/kernel/acpi/cppc.c, and for all of the architectures using the generic arch_topology.c code. Fixes: 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()") Reported-by: Ivan Shapovalov Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219431 Tested-by: Oleksandr Natalenko Signed-off-by: Mario Limonciello Link: https://patch.msgid.link/20241104222855.3959267-1-superm1@kernel.org [ rjw: Changelog edit ] Signed-off-by: Rafael J. Wysocki --- include/acpi/processor.h | 2 ++ include/linux/arch_topology.h | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index e6f6074eadbf..a17e97e634a6 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -465,4 +465,6 @@ extern int acpi_processor_ffh_lpi_probe(unsigned int cpu); extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); #endif +void acpi_processor_init_invariance_cppc(void); + #endif diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index b721f360d759..4a952c4885ed 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -11,10 +11,6 @@ void topology_normalize_cpu_scale(void); int topology_update_cpu_topology(void); -#ifdef CONFIG_ACPI_CPPC_LIB -void topology_init_cpu_capacity_cppc(void); -#endif - struct device_node; bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); -- cgit v1.2.3 From 8c462d56487e3abdbf8a61cedfe7c795a54f4a78 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 6 Nov 2024 16:04:48 +0000 Subject: arm64: smccc: Remove broken support for SMCCCv1.3 SVE discard hint SMCCCv1.3 added a hint bit which callers can set in an SMCCC function ID (AKA "FID") to indicate that it is acceptable for the SMCCC implementation to discard SVE and/or SME state over a specific SMCCC call. The kernel support for using this hint is broken and SMCCC calls may clobber the SVE and/or SME state of arbitrary tasks, though FPSIMD state is unaffected. The kernel support is intended to use the hint when there is no SVE or SME state to save, and to do this it checks whether TIF_FOREIGN_FPSTATE is set or TIF_SVE is clear in assembly code: | ldr , [, #TSK_TI_FLAGS] | tbnz , #TIF_FOREIGN_FPSTATE, 1f // Any live FP state? | tbnz , #TIF_SVE, 2f // Does that state include SVE? | | 1: orr , , ARM_SMCCC_1_3_SVE_HINT | 2: | << SMCCC call using FID >> This is not safe as-is: (1) SMCCC calls can be made in a preemptible context and preemption can result in TIF_FOREIGN_FPSTATE being set or cleared at arbitrary points in time. Thus checking for TIF_FOREIGN_FPSTATE provides no guarantee. (2) TIF_FOREIGN_FPSTATE only indicates that the live FP/SVE/SME state in the CPU does not belong to the current task, and does not indicate that clobbering this state is acceptable. When the live CPU state is clobbered it is necessary to update fpsimd_last_state.st to ensure that a subsequent context switch will reload FP/SVE/SME state from memory rather than consuming the clobbered state. This and the SMCCC call itself must happen in a critical section with preemption disabled to avoid races. (3) Live SVE/SME state can exist with TIF_SVE clear (e.g. with only TIF_SME set), and checking TIF_SVE alone is insufficient. Remove the broken support for the SMCCCv1.3 SVE saving hint. This is effectively a revert of commits: * cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint") * a7c3acca5380 ("arm64: smccc: Save lr before calling __arm_smccc_sve_check()") ... leaving behind the ARM_SMCCC_VERSION_1_3 and ARM_SMCCC_1_3_SVE_HINT definitions, since these are simply definitions from the SMCCC specification, and the latter is used in KVM via ARM_SMCCC_CALL_HINTS. If we want to bring this back in future, we'll probably want to handle this logic in C where we can use all the usual FPSIMD/SVE/SME helper functions, and that'll likely require some rework of the SMCCC code and/or its callers. Fixes: cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint") Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Marc Zyngier Cc: Mark Brown Cc: Will Deacon Cc: stable@vger.kernel.org Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241106160448.2712997-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 32 +++----------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index f59099a213d0..67f6fdf2e7cd 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -315,8 +315,6 @@ u32 arm_smccc_get_version(void); void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit); -extern u64 smccc_has_sve_hint; - /** * arm_smccc_get_soc_id_version() * @@ -414,15 +412,6 @@ struct arm_smccc_quirk { } state; }; -/** - * __arm_smccc_sve_check() - Set the SVE hint bit when doing SMC calls - * - * Sets the SMCCC hint bit to indicate if there is live state in the SVE - * registers, this modifies x0 in place and should never be called from C - * code. - */ -asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0); - /** * __arm_smccc_smc() - make SMC calls * @a0-a7: arguments passed in registers 0 to 7 @@ -490,20 +479,6 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #endif -/* nVHE hypervisor doesn't have a current thread so needs separate checks */ -#if defined(CONFIG_ARM64_SVE) && !defined(__KVM_NVHE_HYPERVISOR__) - -#define SMCCC_SVE_CHECK ALTERNATIVE("nop \n", "bl __arm_smccc_sve_check \n", \ - ARM64_SVE) -#define smccc_sve_clobbers "x16", "x30", "cc", - -#else - -#define SMCCC_SVE_CHECK -#define smccc_sve_clobbers - -#endif - #define __constraint_read_2 "r" (arg0) #define __constraint_read_3 __constraint_read_2, "r" (arg1) #define __constraint_read_4 __constraint_read_3, "r" (arg2) @@ -574,12 +549,11 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, register unsigned long r3 asm("r3"); \ CONCATENATE(__declare_arg_, \ COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__); \ - asm volatile(SMCCC_SVE_CHECK \ - inst "\n" : \ + asm volatile(inst "\n" : \ "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3) \ : CONCATENATE(__constraint_read_, \ COUNT_ARGS(__VA_ARGS__)) \ - : smccc_sve_clobbers "memory"); \ + : "memory"); \ if (___res) \ *___res = (typeof(*___res)){r0, r1, r2, r3}; \ } while (0) @@ -628,7 +602,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, asm ("" : \ : CONCATENATE(__constraint_read_, \ COUNT_ARGS(__VA_ARGS__)) \ - : smccc_sve_clobbers "memory"); \ + : "memory"); \ if (___res) \ ___res->a0 = SMCCC_RET_NOT_SUPPORTED; \ } while (0) -- cgit v1.2.3 From c928807f6f6b6d595a7e199591ae297c81de3aeb Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 28 Oct 2024 12:26:53 -0600 Subject: mm/page_alloc: keep track of free highatomic OOM kills due to vastly overestimated free highatomic reserves were observed: ... invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0 ... Node 0 Normal free:1482936kB boost:0kB min:410416kB low:739404kB high:1068392kB reserved_highatomic:1073152KB ... Node 0 Normal: 1292*4kB (ME) 1920*8kB (E) 383*16kB (UE) 220*32kB (ME) 340*64kB (E) 2155*128kB (UE) 3243*256kB (UE) 615*512kB (U) 1*1024kB (M) 0*2048kB 0*4096kB = 1477408kB The second line above shows that the OOM kill was due to the following condition: free (1482936kB) - reserved_highatomic (1073152kB) = 409784KB < min (410416kB) And the third line shows there were no free pages in any MIGRATE_HIGHATOMIC pageblocks, which otherwise would show up as type 'H'. Therefore __zone_watermark_unusable_free() underestimated the usable free memory by over 1GB, which resulted in the unnecessary OOM kill above. The comments in __zone_watermark_unusable_free() warns about the potential risk, i.e., If the caller does not have rights to reserves below the min watermark then subtract the high-atomic reserves. This will over-estimate the size of the atomic reserve but it avoids a search. However, it is possible to keep track of free pages in reserved highatomic pageblocks with a new per-zone counter nr_free_highatomic protected by the zone lock, to avoid a search when calculating the usable free memory. And the cost would be minimal, i.e., simple arithmetics in the highatomic alloc/free/move paths. Note that since nr_free_highatomic can be relatively small, using a per-cpu counter might cause too much drift and defeat its purpose, in addition to the extra memory overhead. Dependson e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") - see [1] [akpm@linux-foundation.org: s/if/else if/, per Johannes, stealth whitespace tweak] Link: https://lkml.kernel.org/r/20241028182653.3420139-1-yuzhao@google.com Link: https://lkml.kernel.org/r/0d0ddb33-fcdc-43e2-801f-0c1df2031afb@suse.cz [1] Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") Signed-off-by: Yu Zhao Reported-by: Link Lin Acked-by: David Rientjes Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5b1c984daf45..80bc5640bb60 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -823,6 +823,7 @@ struct zone { unsigned long watermark_boost; unsigned long nr_reserved_highatomic; + unsigned long nr_free_highatomic; /* * We don't know if the memory that we're going to allocate will be -- cgit v1.2.3 From 9e05e5c7ee8758141d2db7e8fea2cab34500c6ed Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 4 Nov 2024 19:54:19 +0000 Subject: signal: restore the override_rlimit logic Prior to commit d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") UCOUNT_RLIMIT_SIGPENDING rlimit was not enforced for a class of signals. However now it's enforced unconditionally, even if override_rlimit is set. This behavior change caused production issues. For example, if the limit is reached and a process receives a SIGSEGV signal, sigqueue_alloc fails to allocate the necessary resources for the signal delivery, preventing the signal from being delivered with siginfo. This prevents the process from correctly identifying the fault address and handling the error. From the user-space perspective, applications are unaware that the limit has been reached and that the siginfo is effectively 'corrupted'. This can lead to unpredictable behavior and crashes, as we observed with java applications. Fix this by passing override_rlimit into inc_rlimit_get_ucounts() and skip the comparison to max there if override_rlimit is set. This effectively restores the old behavior. Link: https://lkml.kernel.org/r/20241104195419.3962584-1-roman.gushchin@linux.dev Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") Signed-off-by: Roman Gushchin Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Acked-by: Oleg Nesterov Acked-by: Alexey Gladkov Cc: Kees Cook Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 3625096d5f85..7183e5aca282 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -141,7 +141,8 @@ static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type ty long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type); +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); -- cgit v1.2.3 From e7ac4daeed91a25382091e73818ea0cddb1afd5e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 7 Nov 2024 14:12:46 +1300 Subject: mm: count zeromap read and set for swapout and swapin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the proportion of folios from the zeromap is small, missing their accounting may not significantly impact profiling. However, it's easy to construct a scenario where this becomes an issue—for example, allocating 1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT, and then swapping it back in. In this case, the swap-out and swap-in counts seem to vanish into a black hole, potentially causing semantic ambiguity. On the other hand, Usama reported that zero-filled pages can exceed 10% in workloads utilizing zswap, while Hailong noted that some app in Android have more than 6% zero-filled pages. Before commit 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap"), both zswap and zRAM implemented similar optimizations, leading to these optimized-out pages being counted in either zswap or zRAM counters (with pswpin/pswpout also increasing for zRAM). With zeromap functioning prior to both zswap and zRAM, userspace will no longer detect these swap-out and swap-in actions. We have three ways to address this: 1. Introduce a dedicated counter specifically for the zeromap. 2. Use pswpin/pswpout accounting, treating the zero map as a standard backend. This approach aligns with zRAM's current handling of same-page fills at the device level. However, it would mean losing the optimized-out page counters previously available in zRAM and would not align with systems using zswap. Additionally, as noted by Nhat Pham, pswpin/pswpout counters apply only to I/O done directly to the backend device. 3. Count zeromap pages under zswap, aligning with system behavior when zswap is enabled. However, this would not be consistent with zRAM, nor would it align with systems lacking both zswap and zRAM. Given the complications with options 2 and 3, this patch selects option 1. We can find these counters from /proc/vmstat (counters for the whole system) and memcg's memory.stat (counters for the interested memcg). For example: $ grep -E 'swpin_zero|swpout_zero' /proc/vmstat swpin_zero 1648 swpout_zero 33536 $ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat swpin_zero 3905 swpout_zero 3985 This patch does not address any specific zeromap bug, but the missing swpout and swpin counts for zero-filled pages can be highly confusing and may mislead user-space agents that rely on changes in these counters as indicators. Therefore, we add a Fixes tag to encourage the inclusion of this counter in any kernel versions with zeromap. Many thanks to Kanchana for the contribution of changing count_objcg_event() to count_objcg_events() to support large folios[1], which has now been incorporated into this patch. [1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap") Co-developed-by: Kanchana P Sridhar Signed-off-by: Barry Song Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Usama Arif Cc: Yosry Ahmed Cc: Hailong Liu Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Andi Kleen Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 +++++++----- include/linux/vm_event_item.h | 2 ++ 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 34d2da05f2f1..e1b41554a5fb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1760,8 +1760,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); -static inline void count_objcg_event(struct obj_cgroup *objcg, - enum vm_event_item idx) +static inline void count_objcg_events(struct obj_cgroup *objcg, + enum vm_event_item idx, + unsigned long count) { struct mem_cgroup *memcg; @@ -1770,7 +1771,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - count_memcg_events(memcg, idx, 1); + count_memcg_events(memcg, idx, count); rcu_read_unlock(); } @@ -1825,8 +1826,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) return NULL; } -static inline void count_objcg_event(struct obj_cgroup *objcg, - enum vm_event_item idx) +static inline void count_objcg_events(struct obj_cgroup *objcg, + enum vm_event_item idx, + unsigned long count) { } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index aed952d04132..f70d0958095c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -134,6 +134,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, + SWPIN_ZERO, + SWPOUT_ZERO, #ifdef CONFIG_KSM KSM_SWPIN_COPY, #endif -- cgit v1.2.3 From 27184f8905ba680f22abf1707fbed24036a67119 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 13 Nov 2024 07:54:12 +0200 Subject: tpm: Opt-in in disable PCR integrity protection The initial HMAC session feature added TPM bus encryption and/or integrity protection to various in-kernel TPM operations. This can cause performance bottlenecks with IMA, as it heavily utilizes PCR extend operations. In order to mitigate this performance issue, introduce a kernel command-line parameter to the TPM driver for disabling the integrity protection for PCR extend operations (i.e. TPM2_PCR_Extend). Cc: James Bottomley Link: https://lore.kernel.org/linux-integrity/20241015193916.59964-1-zohar@linux.ibm.com/ Fixes: 6519fea6fd37 ("tpm: add hmac checks to tpm2_pcr_extend()") Tested-by: Mimi Zohar Co-developed-by: Roberto Sassu Signed-off-by: Roberto Sassu Co-developed-by: Mimi Zohar Signed-off-by: Mimi Zohar Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 587b96b4418e..20a40ade8030 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -421,6 +421,7 @@ void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value); u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset); u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset); u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset); +void tpm_buf_append_handle(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle); /* * Check if TPM device is in the firmware upgrade mode. @@ -505,6 +506,8 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, u8 *passphrase, int passphraselen); +void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf, + u8 attributes, u8 *passphrase, int passphraselen); static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, -- cgit v1.2.3 From eb94b7bb10109a14a5431a67e5d8e31cfa06b395 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 11 Nov 2024 00:17:34 +0100 Subject: net: Make copy_safe_from_sockptr() match documentation copy_safe_from_sockptr() return copy_from_sockptr() return copy_from_sockptr_offset() return copy_from_user() copy_from_user() does not return an error on fault. Instead, it returns a number of bytes that were not copied. Have it handled. Patch has a side effect: it un-breaks garbage input handling of nfc_llcp_setsockopt() and mISDN's data_sock_setsockopt(). Fixes: 6309863b31dd ("net: add copy_safe_from_sockptr() helper") Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241111-sockptr-copy-ret-fix-v1-1-a520083a93fb@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/sockptr.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index fc5a206c4043..195debe2b1db 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -77,7 +77,9 @@ static inline int copy_safe_from_sockptr(void *dst, size_t ksize, { if (optlen < ksize) return -EINVAL; - return copy_from_sockptr(dst, optval, ksize); + if (copy_from_sockptr(dst, optval, ksize)) + return -EFAULT; + return 0; } static inline int copy_struct_from_sockptr(void *dst, size_t ksize, -- cgit v1.2.3 From 8eb36164d1a6769a20ed43033510067ff3dab9ee Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 11 Nov 2024 10:16:49 +0000 Subject: bonding: add ns target multicast address to slave device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4598380f9c54 ("bonding: fix ns validation on backup slaves") tried to resolve the issue where backup slaves couldn't be brought up when receiving IPv6 Neighbor Solicitation (NS) messages. However, this fix only worked for drivers that receive all multicast messages, such as the veth interface. For standard drivers, the NS multicast message is silently dropped because the slave device is not a member of the NS target multicast group. To address this, we need to make the slave device join the NS target multicast group, ensuring it can receive these IPv6 NS messages to validate the slave’s status properly. There are three policies before joining the multicast group: 1. All settings must be under active-backup mode (alb and tlb do not support arp_validate), with backup slaves and slaves supporting multicast. 2. We can add or remove multicast groups when arp_validate changes. 3. Other operations, such as enslaving, releasing, or setting NS targets, need to be guarded by arp_validate. Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- include/net/bond_options.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 473a0147769e..18687ccf0638 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -161,5 +161,7 @@ void bond_option_arp_ip_targets_clear(struct bonding *bond); #if IS_ENABLED(CONFIG_IPV6) void bond_option_ns_ip6_targets_clear(struct bonding *bond); #endif +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave); +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave); #endif /* _NET_BOND_OPTIONS_H */ -- cgit v1.2.3