From ea72ce5da22806d5713f3ffb39a6d5ae73841f93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Aug 2024 00:29:36 +0200 Subject: x86/kaslr: Expose and use the end of the physical memory address space iounmap() on x86 occasionally fails to unmap because the provided valid ioremap address is not below high_memory. It turned out that this happens due to KASLR. KASLR uses the full address space between PAGE_OFFSET and vaddr_end to randomize the starting points of the direct map, vmalloc and vmemmap regions. It thereby limits the size of the direct map by using the installed memory size plus an extra configurable margin for hot-plug memory. This limitation is done to gain more randomization space because otherwise only the holes between the direct map, vmalloc, vmemmap and vaddr_end would be usable for randomizing. The limited direct map size is not exposed to the rest of the kernel, so the memory hot-plug and resource management related code paths still operate under the assumption that the available address space can be determined with MAX_PHYSMEM_BITS. request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1 downwards. That means the first allocation happens past the end of the direct map and if unlucky this address is in the vmalloc space, which causes high_memory to become greater than VMALLOC_START and consequently causes iounmap() to fail for valid ioremap addresses. MAX_PHYSMEM_BITS cannot be changed for that because the randomization does not align with address bit boundaries and there are other places which actually require to know the maximum number of address bits. All remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found to be correct. Cure this by exposing the end of the direct map via PHYSMEM_END and use that for the memory hot-plug and resource management related places instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END maps to a variable which is initialized by the KASLR initialization and otherwise it is based on MAX_PHYSMEM_BITS as before. To prevent future hickups add a check into add_pages() to catch callers trying to add memory above PHYSMEM_END. Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions") Reported-by: Max Ramanouski Reported-by: Alistair Popple Signed-off-by: Thomas Gleixner Tested-By: Max Ramanouski Tested-by: Alistair Popple Reviewed-by: Dan Williams Reviewed-by: Alistair Popple Reviewed-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..b3864156eaa4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif +#ifndef PHYSMEM_END +# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +#endif + #include #include -- cgit v1.2.3 From a547a5880cba6f287179135381f1b484b251be31 Mon Sep 17 00:00:00 2001 From: Peter Newman Date: Thu, 22 Aug 2024 12:02:11 -0700 Subject: x86/resctrl: Fix arch_mbm_* array overrun on SNC When using resctrl on systems with Sub-NUMA Clustering enabled, monitoring groups may be allocated RMID values which would overrun the arch_mbm_{local,total} arrays. This is due to inconsistencies in whether the SNC-adjusted num_rmid value or the unadjusted value in resctrl_arch_system_num_rmid_idx() is used. The num_rmid value for the L3 resource is currently: resctrl_arch_system_num_rmid_idx() / snc_nodes_per_l3_cache As a simple fix, make resctrl_arch_system_num_rmid_idx() return the SNC-adjusted, L3 num_rmid value on x86. Fixes: e13db55b5a0d ("x86/resctrl: Introduce snc_nodes_per_l3_cache") Signed-off-by: Peter Newman Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240822190212.1848788-1-peternewman@google.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b0875b99e811..d94abba1c716 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -248,6 +248,7 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); +u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); /* -- cgit v1.2.3 From 1a5caec7f80ca2e659c03f45378ee26915f4eda2 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 30 Aug 2024 07:35:12 -0700 Subject: regulator: core: Stub devm_regulator_bulk_get_const() if !CONFIG_REGULATOR When adding devm_regulator_bulk_get_const() I missed adding a stub for when CONFIG_REGULATOR is not enabled. Under certain conditions (like randconfig testing) this can cause the compiler to reports errors like: error: implicit declaration of function 'devm_regulator_bulk_get_const'; did you mean 'devm_regulator_bulk_get_enable'? Add the stub. Fixes: 1de452a0edda ("regulator: core: Allow drivers to define their init data as const") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408301813.TesFuSbh-lkp@intel.com/ Cc: Neil Armstrong Signed-off-by: Douglas Anderson Link: https://patch.msgid.link/20240830073511.1.Ib733229a8a19fad8179213c05e1af01b51e42328@changeid Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index d986ec13092e..b9ce521910a0 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -452,6 +452,14 @@ static inline int of_regulator_bulk_get_all(struct device *dev, struct device_no return 0; } +static inline int devm_regulator_bulk_get_const( + struct device *dev, int num_consumers, + const struct regulator_bulk_data *in_consumers, + struct regulator_bulk_data **out_consumers) +{ + return 0; +} + static inline int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { -- cgit v1.2.3 From 33f339a1ba54e56bba57ee9a77c71e385ab4825c Mon Sep 17 00:00:00 2001 From: Tze-nan Wu Date: Fri, 30 Aug 2024 16:25:17 +0800 Subject: bpf, net: Fix a potential race in do_sock_getsockopt() There's a potential race when `cgroup_bpf_enabled(CGROUP_GETSOCKOPT)` is false during the execution of `BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN`, but becomes true when `BPF_CGROUP_RUN_PROG_GETSOCKOPT` is called. This inconsistency can lead to `BPF_CGROUP_RUN_PROG_GETSOCKOPT` receiving an "-EFAULT" from `__cgroup_bpf_run_filter_getsockopt(max_optlen=0)`. Scenario shown as below: `process A` `process B` ----------- ------------ BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN enable CGROUP_GETSOCKOPT BPF_CGROUP_RUN_PROG_GETSOCKOPT (-EFAULT) To resolve this, remove the `BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN` macro and directly uses `copy_from_sockptr` to ensure that `max_optlen` is always set before `BPF_CGROUP_RUN_PROG_GETSOCKOPT` is invoked. Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Co-developed-by: Yanghui Li Signed-off-by: Yanghui Li Co-developed-by: Cheng-Jui Wang Signed-off-by: Cheng-Jui Wang Signed-off-by: Tze-nan Wu Acked-by: Stanislav Fomichev Acked-by: Alexei Starovoitov Link: https://patch.msgid.link/20240830082518.23243-1-Tze-nan.Wu@mediatek.com Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index fb3c3e7181e6..ce91d9b2acb9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -390,14 +390,6 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, __ret; \ }) -#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ - copy_from_sockptr(&__ret, optlen, sizeof(int)); \ - __ret; \ -}) - #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ max_optlen, retval) \ ({ \ @@ -518,7 +510,6 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) -#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ -- cgit v1.2.3 From 8f62819aaace77dd85037ae766eb767f8c4417ce Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 23 Aug 2024 11:33:23 +0200 Subject: PCI/pwrctl: Rescan bus on a separate thread If we trigger the bus rescan from sysfs, we'll try to lock the PCI rescan mutex recursively and deadlock - the platform device will be populated and probed on the same thread that handles the sysfs write. Add a workqueue to the pwrctl code on which we schedule the rescan for controlled PCI devices. While at it: add a new interface for initializing the pwrctl context where we'd now assign the parent device address and initialize the workqueue. Link: https://lore.kernel.org/r/20240823093323.33450-3-brgl@bgdev.pl Fixes: 4565d2652a37 ("PCI/pwrctl: Add PCI power control core code") Reported-by: Konrad Dybcio Signed-off-by: Bartosz Golaszewski Signed-off-by: Bjorn Helgaas Reviewed-by: Manivannan Sadhasivam --- include/linux/pci-pwrctl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-pwrctl.h b/include/linux/pci-pwrctl.h index 45e9cfe740e4..0d23dddf59ec 100644 --- a/include/linux/pci-pwrctl.h +++ b/include/linux/pci-pwrctl.h @@ -7,6 +7,7 @@ #define __PCI_PWRCTL_H__ #include +#include struct device; struct device_link; @@ -41,8 +42,10 @@ struct pci_pwrctl { /* Private: don't use. */ struct notifier_block nb; struct device_link *link; + struct work_struct work; }; +void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev); int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl); void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl); int devm_pci_pwrctl_device_set_ready(struct device *dev, -- cgit v1.2.3 From 593377036e50de89132bc1222800174fde0780ec Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Fri, 10 May 2024 23:05:56 -0300 Subject: kvm: Note an RCU quiescent state on guest exit As of today, KVM notes a quiescent state only in guest entry, which is good as it avoids the guest being interrupted for current RCU operations. While the guest vcpu runs, it can be interrupted by a timer IRQ that will check for any RCU operations waiting for this CPU. In case there are any of such, it invokes rcu_core() in order to sched-out the current thread and note a quiescent state. This occasional schedule work will introduce tens of microsseconds of latency, which is really bad for vcpus running latency-sensitive applications, such as real-time workloads. So, note a quiescent state in guest exit, so the interrupted guests is able to deal with any pending RCU operations before being required to invoke rcu_core(), and thus avoid the overhead of related scheduler work. Signed-off-by: Leonardo Bras Acked-by: Paul E. McKenney Acked-by: Sean Christopherson Message-ID: <20240511020557.1198200-1-leobras@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/context_tracking.h | 6 ++++-- include/linux/kvm_host.h | 10 +++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 6e76b9dba00e..8a78fabeafc3 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -80,10 +80,12 @@ static __always_inline bool context_tracking_guest_enter(void) return context_tracking_enabled_this_cpu(); } -static __always_inline void context_tracking_guest_exit(void) +static __always_inline bool context_tracking_guest_exit(void) { if (context_tracking_enabled()) __ct_user_exit(CONTEXT_GUEST); + + return context_tracking_enabled_this_cpu(); } #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) @@ -98,7 +100,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) { } static inline int ct_state(void) { return -1; } static inline int __ct_state(void) { return -1; } static __always_inline bool context_tracking_guest_enter(void) { return false; } -static __always_inline void context_tracking_guest_exit(void) { } +static __always_inline bool context_tracking_guest_exit(void) { return false; } #define CT_WARN_ON(cond) do { } while (0) #endif /* !CONFIG_CONTEXT_TRACKING_USER */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b23c6d48392f..0d5125a3e31a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -485,7 +485,15 @@ static __always_inline void guest_state_enter_irqoff(void) */ static __always_inline void guest_context_exit_irqoff(void) { - context_tracking_guest_exit(); + /* + * Guest mode is treated as a quiescent state, see + * guest_context_enter_irqoff() for more details. + */ + if (!context_tracking_guest_exit()) { + instrumentation_begin(); + rcu_virt_note_context_switch(); + instrumentation_end(); + } } /* -- cgit v1.2.3