From 0e566c8f0f2e8325e35f6f97e13cde5356b41814 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Wed, 21 Jul 2021 17:26:47 +0300
Subject: virtio: Protect vqs list access

VQs may be accessed to mark the device broken while they are being
created/destroyed. Hence protect the access to the vqs list.

Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all virtqueues broken.")
Signed-off-by: Parav Pandit
Link: https://lore.kernel.org/r/20210721142648.1525924-4-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin
---
 include/linux/virtio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index b1894e0323fa..41edbc01ffa4 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -110,6 +110,7 @@ struct virtio_device {
 	bool config_enabled;
 	bool config_change_pending;
 	spinlock_t config_lock;
+	spinlock_t vqs_list_lock; /* Protects VQs list access */
 	struct device dev;
 	struct virtio_device_id id;
 	const struct virtio_config_ops *config;
-- cgit v1.2.3

From c8d182bd387a09a8b95303c8086238e8bf61fcfc Mon Sep 17 00:00:00 2001
From: Xie Yongji
Date: Thu, 15 Jul 2021 16:00:26 +0800
Subject: vdpa: Add documentation for vdpa_alloc_device() macro

The return value of the vdpa_alloc_device() macro is not very clear,
so most callers checked it incorrectly. Let's add some comments to
document it better.

Signed-off-by: Xie Yongji
Link: https://lore.kernel.org/r/20210715080026.242-4-xieyongji@bytedance.com
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
Reviewed-by: Stefano Garzarella
---
 include/linux/vdpa.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 3357ac98878d..8cfe49d201dd 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -277,6 +277,17 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
 					size_t size, const char *name);
 
+/**
+ * vdpa_alloc_device - allocate and initialize a vDPA device
+ *
+ * @dev_struct: the type of the parent structure
+ * @member: the name of struct vdpa_device within the @dev_struct
+ * @parent: the parent device
+ * @config: the bus operations that are supported by this device
+ * @name: name of the vdpa device
+ *
+ * Return allocated data structure or ERR_PTR upon error
+ */
 #define vdpa_alloc_device(dev_struct, member, parent, config, name) \
 			  container_of(__vdpa_alloc_device( \
 				       parent, config, \
-- cgit v1.2.3

From ea2f6af16532511eb1cd8eb62845c37861f24ce8 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Tue, 10 Aug 2021 12:25:05 -0400
Subject: vringh: pull in spinlock header

We use a spinlock now; pull in the correct header to make vringh.h
self-sufficient.

Signed-off-by: Michael S. Tsirkin
---
 include/linux/vringh.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 84db7b8f912f..212892cf9822 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -14,6 +14,7 @@
 #include <linux/virtio_byteorder.h>
 #include <linux/uio.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #if IS_REACHABLE(CONFIG_VHOST_IOTLB)
 #include <linux/dma-direction.h>
 #include <linux/vhost_iotlb.h>
-- cgit v1.2.3
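
To illustrate the return-value convention documented in the
vdpa_alloc_device() patch above: the macro returns an ERR_PTR() on
failure, never NULL, so the result must be tested with IS_ERR() rather
than a NULL check. A minimal sketch of a correct caller follows; the
my_vdpa_dev, my_config_ops and my_probe names are hypothetical, not
taken from any patch here:

    #include <linux/err.h>
    #include <linux/vdpa.h>

    struct my_vdpa_dev {
            struct vdpa_device vdpa;    /* the @member passed to the macro */
            void *priv;
    };

    static const struct vdpa_config_ops my_config_ops = {
            /* ... bus operations elided in this sketch ... */
    };

    static int my_probe(struct device *parent)
    {
            struct my_vdpa_dev *dev;

            dev = vdpa_alloc_device(struct my_vdpa_dev, vdpa,
                                    parent, &my_config_ops, "my_vdpa");
            if (IS_ERR(dev))            /* correct: test for ERR_PTR */
                    return PTR_ERR(dev); /* wrong would be: if (!dev) */

            /* ... device setup and vdpa_register_device() ... */
            return 0;
    }
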
From 879753c816dbbdb2a9a395aa4448d29feee92d1a Mon Sep 17 00:00:00 2001
From: Eli Cohen
Date: Wed, 11 Aug 2021 08:37:59 +0300
Subject: vdpa/mlx5: Fix queue type selection logic

The comment in get_queue_type() says that split virtqueues are
preferred; however, the actual logic preferred packed virtqueues.
Since the firmware has not supported packed virtqueues, we ended up
using split virtqueues, as desired. Since we do not advertise support
for packed virtqueues, add a check to verify that split virtqueues are
indeed supported.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen
Link: https://lore.kernel.org/r/20210811053759.66752-1-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin
---
 include/linux/mlx5/mlx5_ifc_vdpa.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 98b56b75c625..1a9c9d94cb59 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -11,13 +11,15 @@ enum {
 };
 
 enum {
-	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = 0x1,
-	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = 0x2,
+	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0,
+	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1,
 };
 
 enum {
-	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0,
-	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1,
+	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT =
+		BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT),
+	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED =
+		BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED),
 };
 
 struct mlx5_ifc_virtio_q_bits {
-- cgit v1.2.3
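
The driver-side selection logic that consumes these enums lives in
drivers/vdpa/mlx5 and is not part of the header diff above. A rough
sketch of the fixed selection, assuming type_mask holds the queue-type
capability bits reported by firmware (the standalone signature here is
a simplification; the real function derives the mask from the device
capabilities):

    #include <linux/bits.h>
    #include <linux/bug.h>
    #include <linux/mlx5/mlx5_ifc_vdpa.h>

    static int get_queue_type(u32 type_mask)
    {
            /* Prefer split queues, as the comment promises; the driver
             * only advertises split support to the virtio core. */
            if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
                    return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;

            /* Firmware must support at least one queue type. */
            WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
            return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
    }
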
From 3b844826b6c6affa80755254da322b017358a2f4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Thu, 5 Aug 2021 10:04:43 -0700
Subject: pipe: avoid unnecessary EPOLLET wakeups under normal loads

I had forgotten just how sensitive hackbench is to extra pipe wakeups,
and commit 3a34b13a88ca ("pipe: make pipe writes always wake up
readers") ended up causing a quite noticeable regression on larger
machines.

Now, hackbench isn't necessarily a hugely meaningful benchmark, and
it's not clear that this matters in real life all that much, but as
Mel points out, it's used often enough when comparing kernels and so
the performance regression shows up like a sore thumb.

It's easy enough to fix at least for the common cases where pipes are
used purely for data transfer, and you never have any exciting poll
usage at all. So set a special 'poll_usage' flag when there is polling
activity, and make the ugly "EPOLLET has crazy legacy expectations"
semantics explicit to only that case.

I would love to limit it to just the broken EPOLLET case, but the pipe
code can't see the difference between epoll and regular select/poll,
so any non-read/write waiting will trigger the extra wakeup behavior.
That is sufficient for at least the hackbench case.

Apart from making the odd extra wakeup cases more explicitly about
EPOLLET, this also makes the extra wakeup be at the _end_ of the pipe
write, not at the first write chunk. That is actually much saner
semantics (as much as you can call any of the legacy edge-triggered
expectations for EPOLLET "sane") since it means that you know the
wakeup will happen once the write is done, rather than possibly in the
middle of one.

[ For stable people: I'm putting a "Fixes" tag on this, but I leave it
  up to you to decide whether you actually want to backport it or not.
  It likely has no impact outside of synthetic benchmarks - Linus ]

Link: https://lore.kernel.org/lkml/20210802024945.GA8372@xsang-OptiPlex-9020/
Fixes: 3a34b13a88ca ("pipe: make pipe writes always wake up readers")
Reported-by: kernel test robot
Tested-by: Sandeep Patil
Tested-by: Mel Gorman
Signed-off-by: Linus Torvalds
---
 include/linux/pipe_fs_i.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 5d2705f1d01c..fc5642431b92 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -48,6 +48,7 @@ struct pipe_buffer {
  * @files: number of struct file referring this pipe (protected by ->i_lock)
  * @r_counter: reader counter
  * @w_counter: writer counter
+ * @poll_usage: is this pipe used for epoll, which has crazy wakeups?
  * @fasync_readers: reader side fasync
  * @fasync_writers: writer side fasync
  * @bufs: the circular array of pipe buffers
@@ -70,6 +71,7 @@ struct pipe_inode_info {
 	unsigned int files;
 	unsigned int r_counter;
 	unsigned int w_counter;
+	unsigned int poll_usage;
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
-- cgit v1.2.3
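
The fs/pipe.c changes that use this flag are not part of the header
diff above. A simplified sketch of the idea, with the poll-mask
computation elided (an illustration of the described behavior, not the
verbatim fs/pipe.c code):

    /* 1) The poll handler records that this pipe has poll/epoll users,
     *    so the legacy EPOLLET wakeup expectations now apply to it: */
    static __poll_t pipe_poll(struct file *filp, poll_table *wait)
    {
            struct pipe_inode_info *pipe = filp->private_data;

            pipe->poll_usage = 1;
            poll_wait(filp, &pipe->rd_wait, wait);
            /* ... compute and return the EPOLLIN/EPOLLOUT mask ... */
            return 0;
    }

    /* 2) At the end of pipe_write(), the extra reader wakeup is issued
     *    only when the pipe was empty (the normal case) or when the
     *    pipe has seen poll activity: */
            if (was_empty || pipe->poll_usage)
                    wake_up_interruptible_sync_poll(&pipe->rd_wait,
                                                    EPOLLIN | EPOLLRDNORM);
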
From f56ce412a59d7d938b81de8878faef128812482c Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 19 Aug 2021 19:04:21 -0700
Subject: mm: memcontrol: fix occasional OOMs due to proportional memory.low reclaim

We've noticed occasional OOM killing when memory.low settings are in
effect for cgroups. This is unexpected and undesirable as memory.low
is supposed to express non-OOMing memory priorities between cgroups.

The reason for this is proportional memory.low reclaim. When cgroups
are below their memory.low threshold, reclaim passes them over in the
first round, and then retries if it couldn't find pages anywhere else.
But when cgroups are slightly above their memory.low setting, page
scan force is scaled down and diminished in proportion to the overage,
to the point where it can cause reclaim to fail as well - only in that
case we currently don't retry, and instead trigger OOM.

To fix this, hook proportional reclaim into the same retry logic we
have in place for when cgroups are skipped entirely. This way, if
reclaim fails and some cgroups were scanned with diminished pressure,
we'll try another full-force cycle before giving up and OOMing.

[akpm@linux-foundation.org: coding-style fixes]
Link: https://lkml.kernel.org/r/20210817180506.220056-1-hannes@cmpxchg.org
Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
Signed-off-by: Johannes Weiner
Reported-by: Leon Yang
Reviewed-by: Rik van Riel
Reviewed-by: Shakeel Butt
Acked-by: Roman Gushchin
Acked-by: Chris Down
Acked-by: Michal Hocko
Cc: <stable@vger.kernel.org> [5.4+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bfe5c486f4ad..24797929d8a1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -612,12 +612,15 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
-						  struct mem_cgroup *memcg,
-						  bool in_low_reclaim)
+static inline void mem_cgroup_protection(struct mem_cgroup *root,
+					 struct mem_cgroup *memcg,
+					 unsigned long *min,
+					 unsigned long *low)
 {
+	*min = *low = 0;
+
 	if (mem_cgroup_disabled())
-		return 0;
+		return;
 
 	/*
 	 * There is no reclaim protection applied to a targeted reclaim.
@@ -653,13 +656,10 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
 	 *
 	 */
 	if (root == memcg)
-		return 0;
-
-	if (in_low_reclaim)
-		return READ_ONCE(memcg->memory.emin);
+		return;
 
-	return max(READ_ONCE(memcg->memory.emin),
-		   READ_ONCE(memcg->memory.elow));
+	*min = READ_ONCE(memcg->memory.emin);
+	*low = READ_ONCE(memcg->memory.elow);
 }
 
 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
@@ -1147,11 +1147,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 {
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
-						  struct mem_cgroup *memcg,
-						  bool in_low_reclaim)
+static inline void mem_cgroup_protection(struct mem_cgroup *root,
+					 struct mem_cgroup *memcg,
+					 unsigned long *min,
+					 unsigned long *low)
 {
-	return 0;
+	*min = *low = 0;
 }
 
 static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
-- cgit v1.2.3
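
The consumer of the new out-parameter interface is the scan-count
logic in mm/vmscan.c, which is not part of this header diff. Roughly,
the caller can now both scale pressure and tell the retry logic that
pressure was diminished (a simplified sketch; sc stands for the
reclaim cycle's struct scan_control):

    unsigned long low, min, protection;

    mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);

    if (min || low) {
            /*
             * Scale pressure against the larger protection value and,
             * crucially, record that this cgroup was scanned with
             * diminished force - so a failed reclaim run gets one
             * more full-force retry before declaring OOM.
             */
            if (!sc->memcg_low_reclaim && low > min) {
                    protection = low;
                    sc->memcg_low_skipped = 1;
            } else {
                    protection = min;
            }
            /* ... scale the per-LRU scan target by protection ... */
    }
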
From a7cb5d23eaea148f8582229846f8dfff192f05c3 Mon Sep 17 00:00:00 2001
From: Marco Elver
Date: Thu, 19 Aug 2021 19:04:30 -0700
Subject: kfence: fix is_kfence_address() for addresses below KFENCE_POOL_SIZE

Originally the addr != NULL check was meant to take care of the case
where __kfence_pool == NULL (KFENCE is disabled). However, this does
not work for addresses where addr > 0 && addr < KFENCE_POOL_SIZE. This
can be the case on a NULL-deref where addr > 0 && addr < PAGE_SIZE, or
on any other faulting access with addr < KFENCE_POOL_SIZE. While the
kernel would likely crash, the stack traces and report might be
confusing due to double faults upon KFENCE's attempt to unprotect such
an address.

Fix it by just checking that __kfence_pool != NULL instead.

Link: https://lkml.kernel.org/r/20210818130300.2482437-1-elver@google.com
Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Signed-off-by: Marco Elver
Reported-by: Kuan-Ying Lee
Acked-by: Alexander Potapenko
Cc: Dmitry Vyukov
Cc: <stable@vger.kernel.org> [5.12+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/kfence.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index a70d1ea03532..3fe6dd8a18c1 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -51,10 +51,11 @@ extern atomic_t kfence_allocation_gate;
 static __always_inline bool is_kfence_address(const void *addr)
 {
 	/*
-	 * The non-NULL check is required in case the __kfence_pool pointer was
-	 * never initialized; keep it in the slow-path after the range-check.
+	 * The __kfence_pool != NULL check is required to deal with the case
+	 * where __kfence_pool == NULL && addr < KFENCE_POOL_SIZE. Keep it in
+	 * the slow-path after the range-check!
 	 */
-	return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+	return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool);
 }
 
 /**
-- cgit v1.2.3
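
To see why the fixed check is sufficient: the pointer subtraction is
evaluated as an unsigned value, so any addr below __kfence_pool wraps
around to a huge number and fails the range check; the only remaining
hole is an uninitialized (NULL) pool, which the new second operand
closes. A sketch of the intended caller pattern, mirroring the
kfence_free() helper in the same header (my_try_kfence_free is a
hypothetical name):

    #include <linux/kfence.h>

    /*
     * Case analysis of the range check (pool at P, size S):
     *   addr in [P, P+S):         addr - P in [0, S)       -> true
     *   addr below P:             addr - P wraps, >= S     -> false
     *   pool NULL, addr in (0,S): addr - 0 < S, but
     *                             __kfence_pool is NULL    -> false (the fix)
     */
    static bool my_try_kfence_free(void *addr)
    {
            if (!is_kfence_address(addr))
                    return false;           /* not a KFENCE-managed object */
            __kfence_free(addr);            /* release via the KFENCE pool */
            return true;
    }
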